From 363ad94a04e7a428d47a3c77f5ee42762d536b9c Mon Sep 17 00:00:00 2001 From: Ryan Roelke Date: Mon, 21 Oct 2024 14:43:11 -0400 Subject: [PATCH] chore: [sc-57786] [rs] minimize set of functions which need to link with core tiledb to run (#179) --- Cargo.lock | 95 +- Cargo.toml | 29 +- test-utils/cells/Cargo.toml | 21 + test-utils/cells/src/field.rs | 446 +++ test-utils/cells/src/lib.rs | 995 +++++++ test-utils/cells/src/strategy.rs | 635 +++++ test-utils/cells/src/write/mod.rs | 399 +++ test-utils/cells/src/write/strategy.rs | 624 ++++ test-utils/proptest-config/Cargo.toml | 7 + test-utils/proptest-config/src/lib.rs | 122 + test-utils/signal/Cargo.toml | 8 + .../signal.rs => test-utils/signal/src/lib.rs | 0 test-utils/strategy-ext/Cargo.toml | 8 + .../strategy-ext/src/lib.rs | 0 .../strategy-ext/src}/meta.rs | 4 +- .../strategy-ext/src}/records.rs | 4 +- .../strategy-ext/src}/sequence.rs | 4 +- test-utils/uri/Cargo.toml | 9 + test-utils/uri/src/lib.rs | 22 + .../uri/src/tempdir.rs | 17 +- tiledb/api/Cargo.toml | 46 +- tiledb/api/examples/aggregates.rs | 2 +- tiledb/api/examples/fragment_info.rs | 2 +- tiledb/api/examples/groups.rs | 2 +- tiledb/api/examples/multi_range_subarray.rs | 65 +- tiledb/api/examples/query_condition_dense.rs | 2 + tiledb/api/examples/query_condition_sparse.rs | 2 + tiledb/api/examples/quickstart_dense.rs | 2 +- .../api/examples/quickstart_sparse_string.rs | 65 +- tiledb/api/examples/reading_incomplete.rs | 10 +- tiledb/api/examples/using_tiledb_stats.rs | 2 +- tiledb/api/src/array/attribute/arrow.rs | 32 +- tiledb/api/src/array/attribute/mod.rs | 376 +-- tiledb/api/src/array/attribute/pod.rs | 105 + tiledb/api/src/array/dimension/arrow.rs | 22 +- tiledb/api/src/array/dimension/mod.rs | 649 +---- tiledb/api/src/array/dimension/pod.rs | 67 + tiledb/api/src/array/domain/mod.rs | 167 +- tiledb/api/src/array/domain/pod.rs | 40 + tiledb/api/src/array/enumeration/mod.rs | 134 +- tiledb/api/src/array/enumeration/pod.rs | 57 + 
tiledb/api/src/array/fragment_info.rs | 54 +- tiledb/api/src/array/mod.rs | 301 +- tiledb/api/src/array/schema/arrow.rs | 9 +- tiledb/api/src/array/schema/mod.rs | 669 ++--- tiledb/api/src/array/schema/pod.rs | 95 + tiledb/api/src/array/strategy.rs | 49 - tiledb/api/src/datatype/arrow.rs | 1043 +------ tiledb/api/src/datatype/mod.rs | 922 +----- tiledb/api/src/datatype/strategy.rs | 338 --- tiledb/api/src/error.rs | 176 +- tiledb/api/src/filter/arrow.rs | 6 +- tiledb/api/src/filter/ftype.rs | 135 +- tiledb/api/src/filter/list.rs | 72 - tiledb/api/src/filter/mod.rs | 429 +-- tiledb/api/src/filter/option.rs | 30 +- tiledb/api/src/filter/pod.rs | 62 + tiledb/api/src/group/mod.rs | 24 +- tiledb/api/src/lib.rs | 40 +- tiledb/api/src/metadata.rs | 395 +-- tiledb/api/src/query/buffer/mod.rs | 5 +- tiledb/api/src/query/conditions.rs | 38 +- tiledb/api/src/query/mod.rs | 53 +- tiledb/api/src/query/read/aggregate/mod.rs | 115 +- tiledb/api/src/query/read/callback.rs | 17 +- tiledb/api/src/query/read/mod.rs | 8 +- tiledb/api/src/query/read/output/arrow.rs | 11 +- tiledb/api/src/query/read/output/mod.rs | 70 +- tiledb/api/src/query/read/typed.rs | 7 +- tiledb/api/src/query/strategy.rs | 2527 ++++------------- tiledb/api/src/query/subarray.rs | 398 +-- tiledb/api/src/query/write/input/arrow.rs | 94 +- tiledb/api/src/query/write/input/mod.rs | 37 +- tiledb/api/src/query/write/mod.rs | 3 - tiledb/api/src/query/write/strategy.rs | 1639 ----------- tiledb/api/src/stats.rs | 8 +- tiledb/api/src/strategy.rs | 126 - tiledb/api/src/tests/examples/mod.rs | 7 +- tiledb/api/src/tests/examples/sparse_all.rs | 13 +- tiledb/api/src/tests/mod.rs | 35 +- tiledb/api/src/vfs.rs | 38 +- tiledb/common/Cargo.toml | 27 + tiledb/common/src/array/attribute.rs | 224 ++ tiledb/common/src/array/dimension.rs | 255 ++ tiledb/common/src/array/mod.rs | 385 +++ tiledb/common/src/datatype/arrow.rs | 1074 +++++++ .../{api => common}/src/datatype/logical.rs | 26 - tiledb/common/src/datatype/mod.rs | 1063 +++++++ 
.../{api => common}/src/datatype/physical.rs | 55 +- tiledb/common/src/datatype/strategy.rs | 167 ++ tiledb/common/src/filter/mod.rs | 211 ++ tiledb/{api => common}/src/filter/webp.rs | 47 +- tiledb/{api => common}/src/key.rs | 0 tiledb/common/src/lib.rs | 33 + tiledb/common/src/metadata.rs | 289 ++ tiledb/{api => common}/src/range.rs | 697 +++-- tiledb/common/src/vfs.rs | 40 + tiledb/pod/Cargo.toml | 33 + tiledb/pod/src/array/attribute/mod.rs | 34 + .../src/array/attribute/strategy.rs | 102 +- tiledb/pod/src/array/dimension/mod.rs | 34 + .../src/array/dimension/strategy.rs | 240 +- tiledb/pod/src/array/domain/mod.rs | 54 + .../{api => pod}/src/array/domain/strategy.rs | 61 +- tiledb/pod/src/array/enumeration/mod.rs | 23 + .../src/array/enumeration/strategy.rs | 37 +- tiledb/pod/src/array/mod.rs | 5 + tiledb/pod/src/array/schema/mod.rs | 183 ++ .../{api => pod}/src/array/schema/strategy.rs | 124 +- tiledb/pod/src/filter/mod.rs | 2 + tiledb/{api => pod}/src/filter/strategy.rs | 147 +- tiledb/pod/src/lib.rs | 36 + tiledb/pod/src/query/mod.rs | 1 + tiledb/pod/src/query/subarray.rs | 378 +++ tiledb/queries/Cargo.toml | 3 +- tiledb/queries/examples/aggregate-adapters.rs | 139 +- tiledb/queries/src/aggregate.rs | 14 +- tiledb/queries/src/lib.rs | 2 - tiledb/sys/ignored.rs | 20 +- tiledb/sys/src/datatype.rs | 16 - tiledb/sys/src/lib.rs | 2 - tiledb/test-utils/Cargo.toml | 15 - tiledb/test-utils/src/lib.rs | 11 - tiledb/utils/src/option.rs | 11 +- tools/api-coverage/src/main.rs | 2 +- 125 files changed, 11239 insertions(+), 10479 deletions(-) create mode 100644 test-utils/cells/Cargo.toml create mode 100644 test-utils/cells/src/field.rs create mode 100644 test-utils/cells/src/lib.rs create mode 100644 test-utils/cells/src/strategy.rs create mode 100644 test-utils/cells/src/write/mod.rs create mode 100644 test-utils/cells/src/write/strategy.rs create mode 100644 test-utils/proptest-config/Cargo.toml create mode 100644 test-utils/proptest-config/src/lib.rs create mode 100644 
test-utils/signal/Cargo.toml rename tiledb/test-utils/src/signal.rs => test-utils/signal/src/lib.rs (100%) create mode 100644 test-utils/strategy-ext/Cargo.toml rename tiledb/test-utils/src/strategy/mod.rs => test-utils/strategy-ext/src/lib.rs (100%) rename {tiledb/test-utils/src/strategy => test-utils/strategy-ext/src}/meta.rs (98%) rename {tiledb/test-utils/src/strategy => test-utils/strategy-ext/src}/records.rs (99%) rename {tiledb/test-utils/src/strategy => test-utils/strategy-ext/src}/sequence.rs (96%) create mode 100644 test-utils/uri/Cargo.toml create mode 100644 test-utils/uri/src/lib.rs rename tiledb/test-utils/src/uri_generators.rs => test-utils/uri/src/tempdir.rs (73%) create mode 100644 tiledb/api/src/array/attribute/pod.rs create mode 100644 tiledb/api/src/array/dimension/pod.rs create mode 100644 tiledb/api/src/array/domain/pod.rs create mode 100644 tiledb/api/src/array/enumeration/pod.rs create mode 100644 tiledb/api/src/array/schema/pod.rs delete mode 100644 tiledb/api/src/array/strategy.rs delete mode 100644 tiledb/api/src/datatype/strategy.rs create mode 100644 tiledb/api/src/filter/pod.rs delete mode 100644 tiledb/api/src/query/write/strategy.rs delete mode 100644 tiledb/api/src/strategy.rs create mode 100644 tiledb/common/Cargo.toml create mode 100644 tiledb/common/src/array/attribute.rs create mode 100644 tiledb/common/src/array/dimension.rs create mode 100644 tiledb/common/src/array/mod.rs create mode 100644 tiledb/common/src/datatype/arrow.rs rename tiledb/{api => common}/src/datatype/logical.rs (91%) create mode 100644 tiledb/common/src/datatype/mod.rs rename tiledb/{api => common}/src/datatype/physical.rs (88%) create mode 100644 tiledb/common/src/datatype/strategy.rs create mode 100644 tiledb/common/src/filter/mod.rs rename tiledb/{api => common}/src/filter/webp.rs (65%) rename tiledb/{api => common}/src/key.rs (100%) create mode 100644 tiledb/common/src/lib.rs create mode 100644 tiledb/common/src/metadata.rs rename tiledb/{api => 
common}/src/range.rs (79%) create mode 100644 tiledb/common/src/vfs.rs create mode 100644 tiledb/pod/Cargo.toml create mode 100644 tiledb/pod/src/array/attribute/mod.rs rename tiledb/{api => pod}/src/array/attribute/strategy.rs (73%) create mode 100644 tiledb/pod/src/array/dimension/mod.rs rename tiledb/{api => pod}/src/array/dimension/strategy.rs (53%) create mode 100644 tiledb/pod/src/array/domain/mod.rs rename tiledb/{api => pod}/src/array/domain/strategy.rs (81%) create mode 100644 tiledb/pod/src/array/enumeration/mod.rs rename tiledb/{api => pod}/src/array/enumeration/strategy.rs (72%) create mode 100644 tiledb/pod/src/array/mod.rs create mode 100644 tiledb/pod/src/array/schema/mod.rs rename tiledb/{api => pod}/src/array/schema/strategy.rs (77%) create mode 100644 tiledb/pod/src/filter/mod.rs rename tiledb/{api => pod}/src/filter/strategy.rs (81%) create mode 100644 tiledb/pod/src/lib.rs create mode 100644 tiledb/pod/src/query/mod.rs create mode 100644 tiledb/pod/src/query/subarray.rs delete mode 100644 tiledb/sys/src/datatype.rs delete mode 100644 tiledb/test-utils/Cargo.toml delete mode 100644 tiledb/test-utils/src/lib.rs diff --git a/Cargo.lock b/Cargo.lock index fa2655ce..675c7273 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -90,9 +90,9 @@ dependencies = [ [[package]] name = "anyhow" -version = "1.0.81" +version = "1.0.89" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0952808a6c2afd1aa8947271f3a60f1a6763c7b912d210184c5149b5cf147247" +checksum = "86fdf8605db99b54d3cd748a44c6d04df638eb5dafb219b135d0149bd0db01f6" [[package]] name = "api-coverage" @@ -415,6 +415,18 @@ version = "1.0.90" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8cd6604a82acf3039f1144f54b8eb34e91ffba622051189e71b781822d5ee1f5" +[[package]] +name = "cells" +version = "0.1.0" +dependencies = [ + "paste", + "proptest", + "strategy-ext", + "tiledb-common", + "tiledb-pod", + "tiledb-proptest-config", +] + [[package]] name = "cexpr" 
version = "0.6.0" @@ -1201,12 +1213,26 @@ version = "1.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64" +[[package]] +name = "signal" +version = "0.1.0" +dependencies = [ + "nix", +] + [[package]] name = "static_assertions" version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a2eb9349b6444b326872e140eb1cf5e7c522154d69e7a0ffb0fb81c06b37543f" +[[package]] +name = "strategy-ext" +version = "0.1.0" +dependencies = [ + "proptest", +] + [[package]] name = "strsim" version = "0.11.1" @@ -1276,11 +1302,12 @@ dependencies = [ ] [[package]] -name = "tiledb" +name = "tiledb-api" version = "0.1.0" dependencies = [ "anyhow", "arrow", + "cells", "itertools", "num-traits", "paste", @@ -1289,9 +1316,44 @@ dependencies = [ "serde", "serde_json", "thiserror", + "tiledb-common", + "tiledb-pod", + "tiledb-sys", + "tiledb-utils", + "uri", +] + +[[package]] +name = "tiledb-common" +version = "0.1.0" +dependencies = [ + "anyhow", + "arrow-schema", + "paste", + "proptest", + "serde", + "serde_json", + "thiserror", "tiledb-proc-macro", "tiledb-sys", - "tiledb-test-utils", + "tiledb-utils", +] + +[[package]] +name = "tiledb-pod" +version = "0.1.0" +dependencies = [ + "itertools", + "num-traits", + "proptest", + "serde", + "serde_json", + "strategy-ext", + "thiserror", + "tiledb-common", + "tiledb-proc-macro", + "tiledb-proptest-config", + "tiledb-sys", "tiledb-utils", ] @@ -1304,12 +1366,17 @@ dependencies = [ "syn", ] +[[package]] +name = "tiledb-proptest-config" +version = "0.1.0" + [[package]] name = "tiledb-query-adapters" version = "0.1.0" dependencies = [ "pkg-config", - "tiledb", + "tiledb-api", + "tiledb-common", "tiledb-utils", ] @@ -1321,16 +1388,6 @@ dependencies = [ "tiledb-utils", ] -[[package]] -name = "tiledb-test-utils" -version = "0.1.0" -dependencies = [ - "anyhow", - "nix", - "proptest", - "tempfile", -] - [[package]] name = 
"tiledb-utils" version = "0.1.0" @@ -1368,6 +1425,14 @@ version = "0.1.11" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e51733f11c9c4f72aa0c160008246859e340b00807569a0da0e7a1079b27ba85" +[[package]] +name = "uri" +version = "0.1.0" +dependencies = [ + "anyhow", + "tempfile", +] + [[package]] name = "utf8parse" version = "0.2.1" diff --git a/Cargo.toml b/Cargo.toml index 00e92f29..81353572 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -2,15 +2,23 @@ resolver = "2" members = [ "tiledb/api", + "tiledb/common", + "tiledb/pod", "tiledb/proc-macro", "tiledb/queries", "tiledb/sys", - "tiledb/test-utils", "tiledb/utils", + "test-utils/cells", + "test-utils/proptest-config", + "test-utils/signal", + "test-utils/strategy-ext", + "test-utils/uri", "tools/api-coverage" ] default-members = [ "tiledb/api", + "tiledb/common", + "tiledb/pod", "tiledb/proc-macro", "tiledb/queries", "tiledb/sys", @@ -24,11 +32,26 @@ version = "0.1.0" [workspace.dependencies] anyhow = "1.0" +arrow = { version = "52.0.0", features = ["prettyprint"] } +arrow-schema = { version = "52.0.0" } +cells = { path = "test-utils/cells", version = "0.1.0" } +itertools = "0" +num-traits = "0.2" +paste = "1.0" proptest = { version = "1.0.0" } -serde_json = { version = "1.0.114", features = ["float_roundtrip"] } -tiledb = { path = "tiledb/api", version = "0.1.0" } +serde = { version = "1", features = ["derive"] } +serde_json = { version = "1", features = ["float_roundtrip"] } +signal = { path = "test-utils/signal", version = "0.1.0" } +strategy-ext = { path = "test-utils/strategy-ext", version = "0.1.0" } +tempfile = { version = "3" } +thiserror = { version = "1" } +tiledb-api = { path = "tiledb/api", version = "0.1.0" } +tiledb-common = { path = "tiledb/common", version = "0.1.0" } +tiledb-pod = { path = "tiledb/pod", version = "0.1.0" } tiledb-proc-macro = { path = "tiledb/proc-macro", version = "0.1.0" } +tiledb-proptest-config = { path = "test-utils/proptest-config", version = 
"0.1.0" } tiledb-sys = { path = "tiledb/sys", version = "0.1.0" } tiledb-test-utils = { path = "tiledb/test-utils", version = "0.1.0" } tiledb-utils = { path = "tiledb/utils", version = "0.1.0" } pkg-config = "0.3.30" +uri = { path = "test-utils/uri", version = "0.1.0" } diff --git a/test-utils/cells/Cargo.toml b/test-utils/cells/Cargo.toml new file mode 100644 index 00000000..4d29ba39 --- /dev/null +++ b/test-utils/cells/Cargo.toml @@ -0,0 +1,21 @@ +[package] +name = "cells" +edition.workspace = true +rust-version.workspace = true +version.workspace = true + +[dependencies] +paste = { workspace = true } +proptest = { workspace = true } +strategy-ext = { workspace = true } +tiledb-common = { workspace = true } +tiledb-pod = { workspace = true } +tiledb-proptest-config = { workspace = true, optional = true } + +[dev-dependencies] +tiledb-common = { workspace = true, features = ["proptest-strategies"] } +tiledb-pod = { workspace = true, features = ["proptest-strategies"] } + +[features] +default = [] +proptest-strategies = ["dep:tiledb-proptest-config", "tiledb-common/proptest-strategies", "tiledb-pod/proptest-strategies"] diff --git a/test-utils/cells/src/field.rs b/test-utils/cells/src/field.rs new file mode 100644 index 00000000..dba48075 --- /dev/null +++ b/test-utils/cells/src/field.rs @@ -0,0 +1,446 @@ +use paste::paste; +use proptest::bits::{BitSetLike, VarBitSet}; +use strategy_ext::records::Records; +use tiledb_common::datatype::physical::{BitsEq, BitsOrd}; +use tiledb_common::datatype::Error as DatatypeError; + +/// Represents the write query input for a single field. +/// +/// For each variant, the outer Vec is the collection of records, and the interior is value in the +/// cell for the record. Fields with cell val num of 1 are flat, and other cell values use the +/// inner Vec. For fixed-size attributes, the inner Vecs shall all have the same length; for +/// var-sized attributes that is obviously not required. 
+#[derive(Clone, Debug, PartialEq)] +pub enum FieldData { + UInt8(Vec), + UInt16(Vec), + UInt32(Vec), + UInt64(Vec), + Int8(Vec), + Int16(Vec), + Int32(Vec), + Int64(Vec), + Float32(Vec), + Float64(Vec), + VecUInt8(Vec>), + VecUInt16(Vec>), + VecUInt32(Vec>), + VecUInt64(Vec>), + VecInt8(Vec>), + VecInt16(Vec>), + VecInt32(Vec>), + VecInt64(Vec>), + VecFloat32(Vec>), + VecFloat64(Vec>), +} + +#[macro_export] +macro_rules! typed_field_data { + ($($V:ident : $U:ty),+) => { + $( + impl From> for FieldData { + fn from(value: Vec<$U>) -> Self { + FieldData::$V(value) + } + } + + impl From>> for FieldData { + fn from(value: Vec>) -> Self { + paste! { + FieldData::[< Vec $V >](value) + } + } + } + + impl TryFrom for Vec<$U> { + type Error = DatatypeError; + + fn try_from(value: FieldData) -> Result { + if let FieldData::$V(values) = value { + Ok(values) + } else { + $crate::typed_field_data_go!(value, DT, _, + { + Err(DatatypeError::physical_type_mismatch::<$U, DT>()) + }, + { + Err(DatatypeError::physical_type_mismatch::<$U, Vec
>()) + }) + } + } + } + )+ + }; +} + +typed_field_data!(UInt8: u8, UInt16: u16, UInt32: u32, UInt64: u64); +typed_field_data!(Int8: i8, Int16: i16, Int32: i32, Int64: i64); +typed_field_data!(Float32: f32, Float64: f64); + +impl From> for FieldData { + fn from(value: Vec) -> Self { + FieldData::from( + value + .into_iter() + .map(|s| s.into_bytes()) + .collect::>>(), + ) + } +} + +impl Records for FieldData { + fn len(&self) -> usize { + self.len() + } + + fn filter(&self, subset: &VarBitSet) -> Self { + self.filter(subset) + } +} + +/// Applies a generic expression to the interior of a `FieldData` value. +/// +/// The first form of this macro applies the same expression to all variants. +/// The second form enables applying a different expression to the forms +/// with an interior `Vec
` versus `Vec>`. +/// The third form enables applying a different expression to the forms +/// with an interior `Vec
` versus `Vec` versus `Vec>` versus `Vec>`, +/// where `DT` is an integral type and `FT` is a floating-point type. +/// +/// # Examples +/// ``` +/// use cells::field::FieldData; +/// use cells::typed_field_data_go; +/// +/// fn dedup_cells(cells: &mut FieldData) { +/// typed_field_data_go!(cells, ref mut cells_interior, cells_interior.dedup()) +/// } +/// let mut cells = FieldData::UInt64(vec![1, 2, 2, 3, 2]); +/// dedup_cells(&mut cells); +/// assert_eq!(cells, FieldData::UInt64(vec![1, 2, 3, 2])); +/// ``` +#[macro_export] +macro_rules! typed_field_data_go { + ($field:expr, $data:pat, $then:expr) => { + $crate::typed_field_data_go!($field, _DT, $data, $then, $then) + }; + ($field:expr, $DT:ident, $data:pat, $fixed:expr, $var:expr) => { + $crate::typed_field_data_go!( + $field, $DT, $data, $fixed, $var, $fixed, $var + ) + }; + ($field:expr, $DT:ident, $data:pat, $integral_fixed:expr, $integral_var:expr, $float_fixed:expr, $float_var:expr) => {{ + use $crate::field::FieldData; + match $field { + FieldData::UInt8($data) => { + type $DT = u8; + $integral_fixed + } + FieldData::UInt16($data) => { + type $DT = u16; + $integral_fixed + } + FieldData::UInt32($data) => { + type $DT = u32; + $integral_fixed + } + FieldData::UInt64($data) => { + type $DT = u64; + $integral_fixed + } + FieldData::Int8($data) => { + type $DT = i8; + $integral_fixed + } + FieldData::Int16($data) => { + type $DT = i16; + $integral_fixed + } + FieldData::Int32($data) => { + type $DT = i32; + $integral_fixed + } + FieldData::Int64($data) => { + type $DT = i64; + $integral_fixed + } + FieldData::Float32($data) => { + type $DT = f32; + $float_fixed + } + FieldData::Float64($data) => { + type $DT = f64; + $float_fixed + } + FieldData::VecUInt8($data) => { + type $DT = u8; + $integral_var + } + FieldData::VecUInt16($data) => { + type $DT = u16; + $integral_var + } + FieldData::VecUInt32($data) => { + type $DT = u32; + $integral_var + } + FieldData::VecUInt64($data) => { + type $DT = u64; + 
$integral_var + } + FieldData::VecInt8($data) => { + type $DT = i8; + $integral_var + } + FieldData::VecInt16($data) => { + type $DT = i16; + $integral_var + } + FieldData::VecInt32($data) => { + type $DT = i32; + $integral_var + } + FieldData::VecInt64($data) => { + type $DT = i64; + $integral_var + } + FieldData::VecFloat32($data) => { + type $DT = f32; + $float_var + } + FieldData::VecFloat64($data) => { + type $DT = f64; + $float_var + } + } + }}; +} + +/// Applies a generic expression to the interiors of two `FieldData` values with matching variants, +/// i.e. with the same physical data type. Typical usage is for comparing the insides of the two +/// `FieldData` values. +#[macro_export] +macro_rules! typed_field_data_cmp { + ($lexpr:expr, $rexpr:expr, $DT:ident, $lpat:pat, $rpat:pat, $same_type:expr, $else:expr) => {{ + use $crate::field::FieldData; + match ($lexpr, $rexpr) { + (FieldData::UInt8($lpat), FieldData::UInt8($rpat)) => { + type $DT = u8; + $same_type + } + (FieldData::UInt16($lpat), FieldData::UInt16($rpat)) => { + type $DT = u16; + $same_type + } + (FieldData::UInt32($lpat), FieldData::UInt32($rpat)) => { + type $DT = u32; + $same_type + } + (FieldData::UInt64($lpat), FieldData::UInt64($rpat)) => { + type $DT = u64; + $same_type + } + (FieldData::Int8($lpat), FieldData::Int8($rpat)) => { + type $DT = i8; + $same_type + } + (FieldData::Int16($lpat), FieldData::Int16($rpat)) => { + type $DT = i16; + $same_type + } + (FieldData::Int32($lpat), FieldData::Int32($rpat)) => { + type $DT = i32; + $same_type + } + (FieldData::Int64($lpat), FieldData::Int64($rpat)) => { + type $DT = i64; + $same_type + } + (FieldData::Float32($lpat), FieldData::Float32($rpat)) => { + type $DT = f32; + $same_type + } + (FieldData::Float64($lpat), FieldData::Float64($rpat)) => { + type $DT = f64; + $same_type + } + (FieldData::VecUInt8($lpat), FieldData::VecUInt8($rpat)) => { + type $DT = u8; + $same_type + } + (FieldData::VecUInt16($lpat), FieldData::VecUInt16($rpat)) => { 
+ type $DT = u16; + $same_type + } + (FieldData::VecUInt32($lpat), FieldData::VecUInt32($rpat)) => { + type $DT = u32; + $same_type + } + (FieldData::VecUInt64($lpat), FieldData::VecUInt64($rpat)) => { + type $DT = u64; + $same_type + } + (FieldData::VecInt8($lpat), FieldData::VecInt8($rpat)) => { + type $DT = i8; + $same_type + } + (FieldData::VecInt16($lpat), FieldData::VecInt16($rpat)) => { + type $DT = i16; + $same_type + } + (FieldData::VecInt32($lpat), FieldData::VecInt32($rpat)) => { + type $DT = i32; + $same_type + } + (FieldData::VecInt64($lpat), FieldData::VecInt64($rpat)) => { + type $DT = i64; + $same_type + } + (FieldData::VecFloat32($lpat), FieldData::VecFloat32($rpat)) => { + type $DT = f32; + $same_type + } + (FieldData::VecFloat64($lpat), FieldData::VecFloat64($rpat)) => { + type $DT = f64; + $same_type + } + _ => $else, + } + }}; +} + +impl FieldData { + pub fn is_empty(&self) -> bool { + typed_field_data_go!(self, v, v.is_empty()) + } + + pub fn len(&self) -> usize { + typed_field_data_go!(self, v, v.len()) + } + + /// Returns the number of null values. + /// + /// At this time, values in `FieldData` are not nullable, so this is always zero. 
+ pub fn null_count(&self) -> usize { + 0 + } + + pub fn is_cell_single(&self) -> bool { + typed_field_data_go!(self, _DT, _, true, false) + } + + pub fn slice(&self, start: usize, len: usize) -> FieldData { + typed_field_data_go!(self, ref values, { + FieldData::from(values[start..start + len].to_vec().clone()) + }) + } + + pub fn filter(&self, set: &VarBitSet) -> FieldData { + typed_field_data_go!(self, ref values, { + FieldData::from( + values + .clone() + .into_iter() + .enumerate() + .filter(|&(i, _)| set.test(i)) + .map(|(_, e)| e) + .collect::>(), + ) + }) + } + + pub fn truncate(&mut self, len: usize) { + typed_field_data_go!(self, ref mut data, data.truncate(len)) + } + + pub fn sort(&mut self) { + typed_field_data_go!( + self, + DT, + ref mut data, + { + let cmp = |k1: &DT, k2: &DT| k1.bits_cmp(k2); + data.sort_by(cmp) + }, + { + let cmp = |k1: &Vec
, k2: &Vec
| k1.bits_cmp(k2); + data.sort_by(cmp) + } + ); + } + + pub fn extend(&mut self, other: Self) { + typed_field_data_cmp!( + self, + other, + _DT, + ref mut data, + other_data, + { + // the field types match + data.extend(other_data); + }, + { + // if they do not match + panic!("Field types do not match in `FieldData::extend`") + } + ) + } +} + +impl BitsEq for FieldData { + fn bits_eq(&self, other: &Self) -> bool { + typed_field_data_cmp!( + self, + other, + _DT, + ref data, + ref other_data, + data.bits_eq(other_data), // match + false // fields do not match + ) + } +} + +#[cfg(test)] +mod tests { + use proptest::prelude::*; + use tiledb_common::array::CellValNum; + use tiledb_common::datatype::Datatype; + + use super::*; + use crate::strategy::{FieldDataParameters, FieldStrategyDatatype}; + + fn do_field_data_extend(dst: FieldData, src: FieldData) { + let orig_dst = dst.clone(); + let orig_src = src.clone(); + + let mut dst = dst; + dst.extend(src); + + typed_field_data_go!(dst, dst, { + assert_eq!( + orig_dst, + FieldData::from(dst[0..orig_dst.len()].to_vec()) + ); + assert_eq!( + orig_src, + FieldData::from(dst[orig_dst.len()..dst.len()].to_vec()) + ); + assert_eq!(dst.len(), orig_dst.len() + orig_src.len()); + }) + } + + proptest! 
{ + #[test] + fn field_data_extend((dst, src) in (any::(), any::()).prop_flat_map(|(dt, cvn)| { + let params = FieldDataParameters { + datatype: Some(FieldStrategyDatatype::Datatype(dt, cvn)), + ..Default::default() + }; + (any_with::(params.clone()), any_with::(params.clone())) + })) { + do_field_data_extend(dst, src) + } + } +} diff --git a/test-utils/cells/src/lib.rs b/test-utils/cells/src/lib.rs new file mode 100644 index 00000000..295c6953 --- /dev/null +++ b/test-utils/cells/src/lib.rs @@ -0,0 +1,995 @@ +pub mod field; +pub mod write; + +#[cfg(any(test, feature = "proptest-strategies"))] +pub mod strategy; + +use std::cmp::Ordering; +use std::collections::hash_map::Entry; +use std::collections::HashMap; +use std::fmt::Debug; +use std::ops::Range; + +use proptest::bits::{BitSetLike, VarBitSet}; + +use tiledb_common::datatype::physical::{BitsEq, BitsOrd}; + +pub use self::field::FieldData; + +#[derive(Clone, Debug, PartialEq)] +pub struct Cells { + fields: HashMap, +} + +impl Cells { + /// # Panics + /// + /// Panics if the fields do not all have the same number of cells. + pub fn new(fields: HashMap) -> Self { + let mut expect_len: Option = None; + for (_, d) in fields.iter() { + if let Some(expect_len) = expect_len { + assert_eq!(d.len(), expect_len); + } else { + expect_len = Some(d.len()) + } + } + + Cells { fields } + } + + pub fn is_empty(&self) -> bool { + self.fields.values().next().unwrap().is_empty() + } + + pub fn len(&self) -> usize { + self.fields.values().next().unwrap().len() + } + + pub fn fields(&self) -> &HashMap { + &self.fields + } + + /// Copies data from the argument. + /// Overwrites data at common indices and extends `self` where necessary. 
+ pub fn copy_from(&mut self, cells: Self) { + for (field, data) in cells.fields.into_iter() { + match self.fields.entry(field) { + Entry::Vacant(v) => { + v.insert(data); + } + Entry::Occupied(mut o) => { + let prev_write_data = o.get_mut(); + typed_field_data_cmp!( + prev_write_data, + data, + _DT, + ref mut mine, + theirs, + { + if mine.len() <= theirs.len() { + *mine = theirs; + } else { + mine[0..theirs.len()] + .clone_from_slice(theirs.as_slice()); + } + }, + unreachable!() + ); + } + } + } + } + + /// Shortens the cells, keeping the first `len` records and dropping the rest. + pub fn truncate(&mut self, len: usize) { + for data in self.fields.values_mut() { + data.truncate(len) + } + } + + /// Extends this cell data with the contents of another. + /// + /// # Panics + /// + /// Panics if the set of fields in `self` and `other` do not match. + /// + /// Panics if any field in `self` and `other` has a different type. + pub fn extend(&mut self, other: Self) { + let mut other = other; + for (field, data) in self.fields.iter_mut() { + let other_data = other.fields.remove(field).unwrap(); + data.extend(other_data); + } + assert_eq!(other.fields.len(), 0); + } + + /// Returns a view over a slice of the cells, + /// with a subset of the fields viewed as indicated by `keys`. + /// This is useful for comparing a section of `self` to another `Cells` instance. + pub fn view<'a>( + &'a self, + keys: &'a [String], + slice: Range, + ) -> CellsView<'a> { + for k in keys.iter() { + if !self.fields.contains_key(k) { + panic!("Cannot construct view: key '{}' not found (fields are {:?})", + k, self.fields.keys()) + } + } + + CellsView { + cells: self, + keys, + slice, + } + } + + /// Returns a comparator for ordering indices into the cells. 
+ fn index_comparator<'a>( + &'a self, + keys: &'a [String], + ) -> impl Fn(&usize, &usize) -> Ordering + 'a { + move |l: &usize, r: &usize| -> Ordering { + for key in keys.iter() { + typed_field_data_go!(self.fields[key], ref data, { + match BitsOrd::bits_cmp(&data[*l], &data[*r]) { + Ordering::Less => return Ordering::Less, + Ordering::Greater => return Ordering::Greater, + Ordering::Equal => continue, + } + }) + } + Ordering::Equal + } + } + + /// Returns whether the cells are sorted according to `keys`. See `Self::sort`. + pub fn is_sorted(&self, keys: &[String]) -> bool { + let index_comparator = self.index_comparator(keys); + for i in 1..self.len() { + if index_comparator(&(i - 1), &i) == Ordering::Greater { + return false; + } + } + true + } + + /// Sorts the cells using `keys`. If two elements are equal on the first item in `keys`, + /// then they will be ordered using the second; and so on. + /// May not preserve the order of elements which are equal for all fields in `keys`. + pub fn sort(&mut self, keys: &[String]) { + let mut idx = std::iter::repeat(()) + .take(self.len()) + .enumerate() + .map(|(i, _)| i) + .collect::>(); + + let idx_comparator = self.index_comparator(keys); + idx.sort_by(idx_comparator); + + for data in self.fields.values_mut() { + typed_field_data_go!(data, ref mut data, { + let mut unsorted = std::mem::replace( + data, + vec![Default::default(); data.len()], + ); + for i in 0..unsorted.len() { + data[i] = std::mem::take(&mut unsorted[idx[i]]); + } + }); + } + } + + /// Returns a copy of the cells, sorted as if by `self.sort()`. + pub fn sorted(&self, keys: &[String]) -> Self { + let mut sorted = self.clone(); + sorted.sort(keys); + sorted + } + + /// Returns the list of offsets beginning each group, i.e. run of contiguous values on `keys`. + /// + /// This is best used with sorted cells, but that is not required. 
+ /// For each pair of offsets in the output, all cells in that index range are equal; + /// and the adjacent cells outside of the range are not equal. + pub fn identify_groups(&self, keys: &[String]) -> Option> { + if self.is_empty() { + return None; + } + let mut groups = vec![0]; + let mut icmp = 0; + for i in 1..self.len() { + let distinct = keys.iter().any(|k| { + let v = self.fields().get(k).unwrap(); + typed_field_data_go!( + v, + ref cells, + cells[i].bits_ne(&cells[icmp]) + ) + }); + if distinct { + groups.push(i); + icmp = i; + } + } + groups.push(self.len()); + Some(groups) + } + + /// Returns the number of distinct values grouped on `keys` + pub fn count_distinct(&self, keys: &[String]) -> usize { + if self.len() <= 1 { + return self.len(); + } + + let key_cells = { + let key_fields = self + .fields + .iter() + .filter(|(k, _)| keys.contains(k)) + .map(|(k, v)| (k.clone(), v.clone())) + .collect::>(); + Cells::new(key_fields).sorted(keys) + }; + + let mut icmp = 0; + let mut count = 1; + + for i in 1..key_cells.len() { + let distinct = keys.iter().any(|k| { + let v = key_cells.fields().get(k).unwrap(); + typed_field_data_go!( + v, + ref cells, + cells[i].bits_ne(&cells[icmp]) + ) + }); + if distinct { + icmp = i; + count += 1; + } + } + + count + } + + /// Returns a subset of the records using the bitmap to determine which are included + pub fn filter(&self, set: &VarBitSet) -> Cells { + Self::new( + self.fields() + .iter() + .map(|(k, v)| (k.clone(), v.filter(set))) + .collect::>(), + ) + } + + /// Returns a subset of `self` containing only cells which have distinct values in `keys` + /// such that `self.dedup(keys).count_distinct(keys) == self.len()`. + /// The order of cells in the input is preserved and the + /// first cell for each value of `keys` is preserved in the output. 
+ pub fn dedup(&self, keys: &[String]) -> Cells { + if self.is_empty() { + return self.clone(); + } + + let mut idx = (0..self.len()).collect::>(); + + let idx_comparator = self.index_comparator(keys); + idx.sort_by(idx_comparator); + + let mut icmp = 0; + let mut preserve = VarBitSet::new_bitset(idx.len()); + preserve.set(idx[0]); + + for i in 1..idx.len() { + let distinct = keys.iter().any(|k| { + let v = self.fields.get(k).unwrap(); + typed_field_data_go!( + v, + ref field_cells, + field_cells[idx[i]].bits_ne(&field_cells[idx[icmp]]) + ) + }); + if distinct { + icmp = i; + preserve.set(idx[i]); + } + } + + self.filter(&preserve) + } + + /// Returns a copy of `self` with only the fields in `fields`, + /// or `None` if not all the requested fields are present. + pub fn projection(&self, fields: &[&str]) -> Option { + let projection = fields + .iter() + .map(|f| { + self.fields + .get(*f) + .map(|data| (f.to_string(), data.clone())) + }) + .collect::>>()?; + Some(Cells::new(projection)) + } + + /// Adds an additional field to `self`. Returns `true` if successful, + /// i.e. the field data is valid for the current set of cells + /// and there is not already a field for the key. 
+ pub fn add_field(&mut self, key: &str, values: FieldData) -> bool { + if self.len() != values.len() { + return false; + } + + if self.fields.contains_key(key) { + false + } else { + self.fields.insert(key.to_owned(), values); + true + } + } +} + +impl BitsEq for Cells { + fn bits_eq(&self, other: &Self) -> bool { + for (key, mine) in self.fields().iter() { + if let Some(theirs) = other.fields().get(key) { + if !mine.bits_eq(theirs) { + return false; + } + } else { + return false; + } + } + self.fields().keys().len() == other.fields().keys().len() + } +} + +pub struct StructuredCells { + dimensions: Vec, + cells: Cells, +} + +impl StructuredCells { + pub fn new(dimensions: Vec, cells: Cells) -> Self { + let expected_cells: usize = dimensions.iter().cloned().product(); + assert_eq!(expected_cells, cells.len(), "Dimensions: {:?}", dimensions); + + StructuredCells { dimensions, cells } + } + + pub fn num_dimensions(&self) -> usize { + self.dimensions.len() + } + + /// Returns the span of dimension `d` + pub fn dimension_len(&self, d: usize) -> usize { + self.dimensions[d] + } + + pub fn into_inner(self) -> Cells { + self.cells + } + + pub fn slice(&self, slices: Vec>) -> Self { + assert_eq!(slices.len(), self.dimensions.len()); // this is doable but unimportant + + struct NextIndex<'a> { + dimensions: &'a [usize], + ranges: &'a [Range], + cursors: Option>, + } + + impl<'a> NextIndex<'a> { + fn new( + dimensions: &'a [usize], + ranges: &'a [Range], + ) -> Self { + for r in ranges { + if r.is_empty() { + return NextIndex { + dimensions, + ranges, + cursors: None, + }; + } + } + + NextIndex { + dimensions, + ranges, + cursors: Some( + ranges.iter().map(|r| r.start).collect::>(), + ), + } + } + + fn compute(&self) -> usize { + let Some(cursors) = self.cursors.as_ref() else { + unreachable!() + }; + let mut index = 0; + let mut scale = 1; + for i in 0..self.dimensions.len() { + let i = self.dimensions.len() - i - 1; + index += cursors[i] * scale; + scale *= 
self.dimensions[i]; + } + index + } + + fn advance(&mut self) { + let Some(cursors) = self.cursors.as_mut() else { + return; + }; + for d in 0..self.dimensions.len() { + let d = self.dimensions.len() - d - 1; + if cursors[d] + 1 < self.ranges[d].end { + cursors[d] += 1; + return; + } else { + cursors[d] = self.ranges[d].start; + } + } + + // this means that we reset the final dimension + self.cursors = None; + } + } + + impl Iterator for NextIndex<'_> { + type Item = usize; + fn next(&mut self) -> Option { + if self.cursors.is_some() { + let index = self.compute(); + self.advance(); + Some(index) + } else { + None + } + } + } + + let mut v = VarBitSet::new_bitset(self.cells.len()); + + NextIndex::new(self.dimensions.as_slice(), slices.as_slice()) + .for_each(|idx| v.set(idx)); + + StructuredCells { + dimensions: self.dimensions.clone(), + cells: self.cells.filter(&v), + } + } +} + +#[derive(Clone, Debug)] +pub struct CellsView<'a> { + cells: &'a Cells, + keys: &'a [String], + slice: Range, +} + +impl<'b> PartialEq> for CellsView<'_> { + fn eq(&self, other: &CellsView<'b>) -> bool { + // must have same number of values + if self.slice.len() != other.slice.len() { + return false; + } + + for key in self.keys.iter() { + let Some(mine) = self.cells.fields.get(key) else { + // validated on construction + unreachable!() + }; + let Some(theirs) = other.cells.fields.get(key) else { + return false; + }; + + typed_field_data_cmp!( + mine, + theirs, + _DT, + ref mine, + ref theirs, + if mine[self.slice.clone()] != theirs[other.slice.clone()] { + return false; + }, + return false + ); + } + + self.keys.len() == other.keys.len() + } +} + +#[cfg(test)] +mod tests { + use std::collections::HashSet; + use std::rc::Rc; + + use proptest::prelude::*; + use tiledb_common::datatype::physical::BitsKeyAdapter; + use tiledb_pod::array::schema::SchemaData; + + use super::*; + use crate::strategy::{CellsParameters, CellsStrategySchema}; + + fn do_cells_extend(dst: Cells, src: Cells) { + let 
orig_dst = dst.clone(); + let orig_src = src.clone(); + + let mut dst = dst; + dst.extend(src); + + for (fname, data) in dst.fields().iter() { + let orig_dst = orig_dst.fields().get(fname).unwrap(); + let orig_src = orig_src.fields().get(fname).unwrap(); + + typed_field_data_go!(data, ref dst, { + assert_eq!( + *orig_dst, + FieldData::from(dst[0..orig_dst.len()].to_vec()) + ); + assert_eq!( + *orig_src, + FieldData::from(dst[orig_dst.len()..dst.len()].to_vec()) + ); + assert_eq!(dst.len(), orig_dst.len() + orig_src.len()); + }); + } + + // all Cells involved should have same set of fields + assert_eq!(orig_dst.fields.len(), dst.fields.len()); + assert_eq!(orig_src.fields.len(), dst.fields.len()); + } + + fn do_cells_sort(cells: Cells, keys: Vec) { + let cells_sorted = cells.sorted(keys.as_slice()); + assert!(cells_sorted.is_sorted(keys.as_slice())); + + assert_eq!(cells.fields().len(), cells_sorted.fields().len()); + + if cells.is_sorted(keys.as_slice()) { + // running the sort should not have changed anything + assert_eq!(cells, cells_sorted); + } + + /* + * We want to verify that the contents of the records are the + * same before and after the sort. We can precisely do that + * with a hash join, though it's definitely tricky to turn + * the columnar data into rows, or we can approximate it + * by sorting and comparing each column, which is not fully + * precise but way easier. 
+ */ + for (fname, data) in cells.fields().iter() { + let Some(data_sorted) = cells_sorted.fields().get(fname) else { + unreachable!() + }; + + let orig_sorted = { + let mut orig = data.clone(); + orig.sort(); + orig + }; + let sorted_sorted = { + let mut sorted = data_sorted.clone(); + sorted.sort(); + sorted + }; + assert_eq!(orig_sorted, sorted_sorted); + } + } + + fn do_cells_slice_1d(cells: Cells, slice: Range) { + let cells = StructuredCells::new(vec![cells.len()], cells); + let sliced = cells.slice(vec![slice.clone()]).into_inner(); + let cells = cells.into_inner(); + + assert_eq!(cells.fields().len(), sliced.fields().len()); + + for (key, value) in cells.fields().iter() { + let Some(sliced) = sliced.fields().get(key) else { + unreachable!() + }; + assert_eq!( + value.slice(slice.start, slice.end - slice.start), + *sliced + ); + } + } + + fn do_cells_slice_2d( + cells: Cells, + d1: usize, + d2: usize, + s1: Range, + s2: Range, + ) { + let mut cells = cells; + cells.truncate(d1 * d2); + + let cells = StructuredCells::new(vec![d1, d2], cells); + let sliced = cells.slice(vec![s1.clone(), s2.clone()]).into_inner(); + let cells = cells.into_inner(); + + assert_eq!(cells.fields().len(), sliced.fields().len()); + + for (key, value) in cells.fields.iter() { + let Some(sliced) = sliced.fields().get(key) else { + unreachable!() + }; + + assert_eq!(s1.len() * s2.len(), sliced.len()); + + typed_field_data_cmp!( + value, + sliced, + _DT, + ref value_data, + ref sliced_data, + { + for r in s1.clone() { + let value_start = (r * d2) + s2.start; + let value_end = (r * d2) + s2.end; + let value_expect = &value_data[value_start..value_end]; + + let sliced_start = (r - s1.start) * s2.len(); + let sliced_end = (r + 1 - s1.start) * s2.len(); + let sliced_cmp = &sliced_data[sliced_start..sliced_end]; + + assert_eq!(value_expect, sliced_cmp); + } + }, + unreachable!() + ); + } + } + + fn do_cells_slice_3d( + cells: Cells, + d1: usize, + d2: usize, + d3: usize, + s1: Range, + s2: 
Range, + s3: Range, + ) { + let mut cells = cells; + cells.truncate(d1 * d2 * d3); + + let cells = StructuredCells::new(vec![d1, d2, d3], cells); + let sliced = cells + .slice(vec![s1.clone(), s2.clone(), s3.clone()]) + .into_inner(); + let cells = cells.into_inner(); + + assert_eq!(cells.fields().len(), sliced.fields().len()); + + for (key, value) in cells.fields.iter() { + let Some(sliced) = sliced.fields.get(key) else { + unreachable!() + }; + + assert_eq!(s1.len() * s2.len() * s3.len(), sliced.len()); + + typed_field_data_cmp!( + value, + sliced, + _DT, + ref value_data, + ref sliced_data, + { + for z in s1.clone() { + for y in s2.clone() { + let value_start = + (z * d2 * d3) + (y * d3) + s3.start; + let value_end = (z * d2 * d3) + (y * d3) + s3.end; + let value_expect = + &value_data[value_start..value_end]; + + let sliced_start = + ((z - s1.start) * s2.len() * s3.len()) + + ((y - s2.start) * s3.len()); + let sliced_end = + ((z - s1.start) * s2.len() * s3.len()) + + ((y + 1 - s2.start) * s3.len()); + let sliced_cmp = + &sliced_data[sliced_start..sliced_end]; + + assert_eq!(value_expect, sliced_cmp); + } + } + }, + unreachable!() + ); + } + } + + /// Assert that the output of [Cells::identify_groups] produces + /// correct output for the given `keys`. 
+ fn do_cells_identify_groups(cells: Cells, keys: &[String]) { + let Some(actual) = cells.identify_groups(keys) else { + assert!(cells.is_empty()); + return; + }; + + for w in actual.windows(2) { + let (start, end) = (w[0], w[1]); + assert!(start < end); + } + + for w in actual.windows(2) { + let (start, end) = (w[0], w[1]); + for k in keys.iter() { + let f = cells.fields().get(k).unwrap(); + typed_field_data_go!(f, ref field_cells, { + for i in start..end { + assert!(field_cells[start].bits_eq(&field_cells[i])); + } + }) + } + if end < cells.len() { + let some_ne = keys.iter().any(|k| { + let f = cells.fields().get(k).unwrap(); + typed_field_data_go!(f, ref field_cells, { + field_cells[start].bits_ne(&field_cells[end]) + }) + }); + assert!(some_ne); + } + } + + assert_eq!(Some(cells.len()), actual.last().copied()); + } + + fn do_cells_count_distinct_1d(cells: Cells) { + for (key, field_cells) in cells.fields().iter() { + let expect_count = + typed_field_data_go!(field_cells, ref field_cells, { + let mut c = field_cells.clone(); + c.sort_by(|l, r| l.bits_cmp(r)); + c.dedup_by(|l, r| l.bits_eq(r)); + c.len() + }); + + let keys_for_distinct = vec![key.clone()]; + let actual_count = + cells.count_distinct(keys_for_distinct.as_slice()); + + assert_eq!(expect_count, actual_count); + } + } + + fn do_cells_count_distinct_2d(cells: Cells) { + let keys = cells.fields().keys().collect::>(); + + for i in 0..keys.len() { + for j in 0..keys.len() { + let expect_count = { + typed_field_data_go!( + cells.fields().get(keys[i]).unwrap(), + ref ki_cells, + { + typed_field_data_go!( + cells.fields().get(keys[j]).unwrap(), + ref kj_cells, + { + let mut unique = HashMap::new(); + + for r in 0..ki_cells.len() { + let values = match unique + .entry(BitsKeyAdapter(&ki_cells[r])) + { + Entry::Vacant(v) => { + v.insert(HashSet::new()) + } + Entry::Occupied(o) => o.into_mut(), + }; + values.insert(BitsKeyAdapter( + &kj_cells[r], + )); + } + + unique.values().flatten().count() + } + ) + } + ) 
+ }; + + let keys_for_distinct = vec![keys[i].clone(), keys[j].clone()]; + let actual_count = + cells.count_distinct(keys_for_distinct.as_slice()); + + assert_eq!(expect_count, actual_count); + } + } + } + + fn do_cells_dedup(cells: Cells, keys: Vec) { + let dedup = cells.dedup(keys.as_slice()); + assert_eq!(dedup.len(), dedup.count_distinct(keys.as_slice())); + + // invariant check + for field in dedup.fields().values() { + assert_eq!(dedup.len(), field.len()); + } + + if dedup.is_empty() { + assert!(cells.is_empty()); + return; + } else if dedup.len() == cells.len() { + assert_eq!(cells, dedup); + return; + } + + // check that order within the original cells is preserved + assert_eq!(cells.view(&keys, 0..1), dedup.view(&keys, 0..1)); + + let mut in_cursor = 1; + let mut out_cursor = 1; + + while in_cursor < cells.len() && out_cursor < dedup.len() { + if cells.view(&keys, in_cursor..(in_cursor + 1)) + == dedup.view(&keys, out_cursor..(out_cursor + 1)) + { + out_cursor += 1; + in_cursor += 1; + } else { + in_cursor += 1; + } + } + assert_eq!(dedup.len(), out_cursor); + } + + fn do_cells_projection(cells: Cells, keys: Vec) { + let proj = cells + .projection(&keys.iter().map(|s| s.as_ref()).collect::>()) + .unwrap(); + + for key in keys.iter() { + let Some(field_in) = cells.fields().get(key) else { + unreachable!() + }; + let Some(field_out) = proj.fields().get(key) else { + unreachable!() + }; + + assert_eq!(field_in, field_out); + } + + // everything in `keys` is in the projection, there should be no other fields + assert_eq!(keys.len(), proj.fields().len()); + } + + proptest! 
{ + #[test] + fn cells_extend((dst, src) in any::().prop_flat_map(|s| { + let params = CellsParameters { + schema: Some(CellsStrategySchema::WriteSchema(Rc::new(s))), + ..Default::default() + }; + (any_with::(params.clone()), any_with::(params.clone())) + })) { + do_cells_extend(dst, src) + } + + #[test] + fn cells_sort((cells, keys) in any::().prop_flat_map(|c| { + let keys = c.fields().keys().cloned().collect::>(); + let nkeys = keys.len(); + (Just(c), proptest::sample::subsequence(keys, 0..=nkeys).prop_shuffle()) + })) { + do_cells_sort(cells, keys) + } + + #[test] + fn cells_slice_1d((cells, bound1, bound2) in any::().prop_flat_map(|cells| { + let slice_min = 0; + let slice_max = cells.len(); + (Just(cells), + slice_min..=slice_max, + slice_min..=slice_max) + })) { + let start = std::cmp::min(bound1, bound2); + let end = std::cmp::max(bound1, bound2); + do_cells_slice_1d(cells, start.. end) + } + + #[test] + fn cells_slice_2d((cells, d1, d2, b11, b12, b21, b22) in any_with::(CellsParameters { + min_records: 1, + ..Default::default() + }).prop_flat_map(|cells| { + let ncells = cells.len(); + (Just(cells), + 1..=((ncells as f64).sqrt() as usize), + 1..=((ncells as f64).sqrt() as usize)) + .prop_flat_map(|(cells, d1, d2)| { + (Just(cells), + Just(d1), + Just(d2), + 0..=d1, + 0..=d1, + 0..=d2, + 0..=d2) + }) + })) { + let s1 = std::cmp::min(b11, b12).. std::cmp::max(b11, b12); + let s2 = std::cmp::min(b21, b22).. 
std::cmp::max(b21, b22); + do_cells_slice_2d(cells, d1, d2, s1, s2) + } + + #[test] + fn cells_slice_3d((cells, d1, d2, d3, b11, b12, b21, b22, b31, b32) in any_with::(CellsParameters { + min_records: 1, + ..Default::default() + }).prop_flat_map(|cells| { + let ncells = cells.len(); + (Just(cells), + 1..=((ncells as f64).cbrt() as usize), + 1..=((ncells as f64).cbrt() as usize), + 1..=((ncells as f64).cbrt() as usize)) + .prop_flat_map(|(cells, d1, d2, d3)| { + (Just(cells), + Just(d1), + Just(d2), + Just(d3), + 0..=d1, + 0..=d1, + 0..=d2, + 0..=d2, + 0..=d3, + 0..=d3) + }) + })) { + let s1 = std::cmp::min(b11, b12).. std::cmp::max(b11, b12); + let s2 = std::cmp::min(b21, b22).. std::cmp::max(b21, b22); + let s3 = std::cmp::min(b31, b32).. std::cmp::max(b31, b32); + do_cells_slice_3d(cells, d1, d2, d3, s1, s2, s3) + } + + #[test] + fn cells_identify_groups((cells, keys) in any::().prop_flat_map(|c| { + let keys = c.fields().keys().cloned().collect::>(); + let nkeys = keys.len(); + (Just(c), proptest::sample::subsequence(keys, 0..=nkeys)) + })) + { + do_cells_identify_groups(cells, &keys) + } + + #[test] + fn cells_count_distinct_1d(cells in any::()) { + do_cells_count_distinct_1d(cells) + } + + #[test] + fn cells_count_distinct_2d(cells in any::()) { + prop_assume!(cells.fields().len() >= 2); + do_cells_count_distinct_2d(cells) + } + + #[test] + fn cells_dedup((cells, keys) in any::().prop_flat_map(|c| { + let keys = c.fields().keys().cloned().collect::>(); + let nkeys = keys.len(); + (Just(c), proptest::sample::subsequence(keys, 0..=nkeys).prop_shuffle()) + })) + { + do_cells_dedup(cells, keys) + } + + #[test] + fn cells_projection((cells, keys) in any::().prop_flat_map(|c| { + let keys = c.fields().keys().cloned().collect::>(); + let nkeys = keys.len(); + (Just(c), proptest::sample::subsequence(keys, 0..=nkeys).prop_shuffle()) + })) { + do_cells_projection(cells, keys) + } + } +} diff --git a/test-utils/cells/src/strategy.rs b/test-utils/cells/src/strategy.rs new 
file mode 100644 index 00000000..6cbd1f97 --- /dev/null +++ b/test-utils/cells/src/strategy.rs @@ -0,0 +1,635 @@ +use std::collections::HashMap; +use std::fmt::Debug; +use std::ops::RangeInclusive; +use std::rc::Rc; + +use proptest::collection::SizeRange; +use proptest::prelude::*; +use proptest::strategy::{NewTree, ValueTree}; +use proptest::test_runner::TestRunner; +use strategy_ext::records::RecordsValueTree; +use tiledb_common::array::{ArrayType, CellValNum}; +use tiledb_common::datatype::{Datatype, PhysicalType}; +use tiledb_common::{dimension_constraints_go, physical_type_go}; +use tiledb_pod::array::schema::{FieldData as SchemaField, SchemaData}; + +use super::field::FieldData; +use super::Cells; + +trait IntegralType: Eq + Ord + PhysicalType {} + +impl IntegralType for u8 {} +impl IntegralType for u16 {} +impl IntegralType for u32 {} +impl IntegralType for u64 {} +impl IntegralType for i8 {} +impl IntegralType for i16 {} +impl IntegralType for i32 {} +impl IntegralType for i64 {} + +#[derive(Clone, Debug)] +pub enum FieldStrategyDatatype { + Datatype(Datatype, CellValNum), + SchemaField(SchemaField), +} + +#[derive(Clone, Debug)] +pub struct FieldDataParameters { + pub nrecords: SizeRange, + pub datatype: Option, + pub value_min_var_size: usize, + pub value_max_var_size: usize, +} + +impl Default for FieldDataParameters { + fn default() -> Self { + FieldDataParameters { + nrecords: (0..=1024).into(), + datatype: None, + value_min_var_size: 1, /* SC-48409 and SC-48428 workaround */ + value_max_var_size: 8, /* TODO */ + } + } +} + +trait ArbitraryFieldData: Sized { + fn arbitrary( + params: FieldDataParameters, + cell_val_num: CellValNum, + value_strat: BoxedStrategy, + ) -> BoxedStrategy; +} + +impl
ArbitraryFieldData for DT +where + DT: IntegralType, + FieldData: From> + From>>, +{ + fn arbitrary( + params: FieldDataParameters, + cell_val_num: CellValNum, + value_strat: BoxedStrategy, + ) -> BoxedStrategy { + if cell_val_num == 1u32 { + proptest::collection::vec(value_strat, params.nrecords) + .prop_map(FieldData::from) + .boxed() + } else { + let (min, max) = if cell_val_num.is_var_sized() { + (params.value_min_var_size, params.value_max_var_size) + } else { + let fixed_bound = Into::::into(cell_val_num) as usize; + (fixed_bound, fixed_bound) + }; + + let cell_strat = proptest::collection::vec(value_strat, min..=max); + + proptest::collection::vec(cell_strat, params.nrecords) + .prop_map(FieldData::from) + .boxed() + } + } +} +impl ArbitraryFieldData for f32 { + fn arbitrary( + params: FieldDataParameters, + cell_val_num: CellValNum, + value_strat: BoxedStrategy, + ) -> BoxedStrategy { + let value_strat = value_strat.prop_map(|float| float.to_bits()).boxed(); + + fn transform(v: Vec) -> Vec { + v.into_iter().map(f32::from_bits).collect::>() + } + + ::arbitrary( + params, + cell_val_num, + value_strat, + ) + .prop_map(|field_data| match field_data { + FieldData::UInt32(values) => FieldData::Float32(transform(values)), + FieldData::VecUInt32(values) => FieldData::VecFloat32( + values.into_iter().map(transform).collect::>>(), + ), + _ => unreachable!(), + }) + .boxed() + } +} + +impl ArbitraryFieldData for f64 { + fn arbitrary( + params: FieldDataParameters, + cell_val_num: CellValNum, + value_strat: BoxedStrategy, + ) -> BoxedStrategy { + let value_strat = value_strat.prop_map(|float| float.to_bits()).boxed(); + + fn transform(v: Vec) -> Vec { + v.into_iter().map(f64::from_bits).collect::>() + } + + ::arbitrary( + params, + cell_val_num, + value_strat, + ) + .prop_map(|field_data| match field_data { + FieldData::UInt64(values) => FieldData::Float64(transform(values)), + FieldData::VecUInt64(values) => FieldData::VecFloat64( + 
values.into_iter().map(transform).collect::>>(), + ), + _ => unreachable!(), + }) + .boxed() + } +} + +impl Arbitrary for FieldData { + type Strategy = BoxedStrategy; + type Parameters = FieldDataParameters; + + fn arbitrary_with(params: Self::Parameters) -> Self::Strategy { + match params.datatype.clone() { + Some(FieldStrategyDatatype::SchemaField( + SchemaField::Dimension(d), + )) => { + let value_strat = d.value_strategy(); + let cell_val_num = d.cell_val_num(); + + dimension_constraints_go!( + d.constraints, + DT, + ref domain, + _, + { +
::arbitrary( + params, + cell_val_num, + value_strat.try_into().unwrap(), + ) + }, + { + ::arbitrary( + params, + cell_val_num, + value_strat.try_into().unwrap(), + ) + } + ) + } + Some(FieldStrategyDatatype::SchemaField( + SchemaField::Attribute(a), + )) => { + let value_strat = a.value_strategy(); + let cell_val_num = + a.cell_val_num.unwrap_or(CellValNum::single()); + + physical_type_go!(a.datatype, DT, { +
::arbitrary( + params, + cell_val_num, + value_strat.try_into().unwrap(), + ) + }) + } + Some(FieldStrategyDatatype::Datatype(datatype, cell_val_num)) => { + physical_type_go!(datatype, DT, { + let value_strat = any::
().boxed(); +
::arbitrary( + params, + cell_val_num, + value_strat, + ) + }) + } + None => (any::(), any::()) + .prop_flat_map(move |(datatype, cell_val_num)| { + physical_type_go!(datatype, DT, { + let value_strat = any::
().boxed(); +
::arbitrary( + params.clone(), + cell_val_num, + value_strat, + ) + }) + }) + .boxed(), + } + } +} + +/// Mask for whether a field should be included in a write query. +// As of this writing, core does not support default values being filled in, +// so this construct is not terribly useful. But someday that may change +#[derive(Copy, Clone, Debug, Eq, PartialEq)] +enum FieldMask { + /// This field must appear in the write set + Include, + /// This field appears in the write set but simplification may change that + TentativelyInclude, + /// This field may appear in the write set again after complication + _TentativelyExclude, + /// This field may not appear in the write set again + Exclude, +} + +impl FieldMask { + pub fn is_included(&self) -> bool { + matches!(self, FieldMask::Include | FieldMask::TentativelyInclude) + } +} + +/// Value tree to shrink cells. +/// For a failing test which writes N records, there are 2^N possible +/// candidate subsets and we want to find the smallest one which fails the test +/// in the shortest number of iterations. +/// That would be ideal but really finding any input that's small enough +/// to be human readable sounds good enough. We divide the record space +/// into CELLS_VALUE_TREE_EXPLORE_PIECES chunks and identify which +/// of those chunks are necessary for the failure. +/// Recur until all of the chunks are necessary for failure, or there +/// is only one record. +/// +/// TODO: for var sized attributes, follow up by shrinking the values. 
+struct CellsValueTree { + _field_masks: HashMap, + field_data_tree: RecordsValueTree>, +} + +impl CellsValueTree { + pub fn new( + params: CellsParameters, + field_data: HashMap)>, + ) -> Self { + // sanity check + { + let mut nrecords = None; + for f in field_data.values() { + if let Some(f) = f.1.as_ref() { + if let Some(nrecords) = nrecords { + assert_eq!(nrecords, f.len()) + } else { + nrecords = Some(f.len()) + } + } + } + } + + let field_masks = field_data + .iter() + .map(|(fname, &(fmask, _))| (fname.clone(), fmask)) + .collect::>(); + let field_data = field_data + .into_iter() + .filter(|&(_, (fmask, _))| fmask.is_included()) + .map(|(fname, (_, fdata))| (fname, fdata.unwrap())) + .collect::>(); + + let field_data_tree = + RecordsValueTree::new(params.min_records, field_data); + + CellsValueTree { + _field_masks: field_masks, + field_data_tree, + } + } +} + +impl ValueTree for CellsValueTree { + type Value = Cells; + + fn current(&self) -> Self::Value { + Cells::new(self.field_data_tree.current()) + } + + fn simplify(&mut self) -> bool { + self.field_data_tree.simplify() + } + + fn complicate(&mut self) -> bool { + self.field_data_tree.complicate() + } +} + +#[derive(Clone, Debug)] +pub enum CellsStrategySchema { + /// Quick-and-dirty set of fields to write to + Fields(HashMap), + /// Schema for writing + WriteSchema(Rc), + /// Schema for reading + ReadSchema(Rc), +} + +impl CellsStrategySchema { + pub fn array_schema(&self) -> Option<&SchemaData> { + match self { + Self::WriteSchema(s) | Self::ReadSchema(s) => Some(s.as_ref()), + _ => None, + } + } + + fn new_field_tree( + &self, + runner: &mut TestRunner, + nrecords: RangeInclusive, + ) -> HashMap)> { + let field_data_parameters_base = FieldDataParameters::default(); + + match self { + Self::Fields(fields) => { + let nrecords = nrecords.new_tree(runner).unwrap().current(); + + let field_mask = fields + .iter() + .map(|(k, v)| { + (k.to_string(), (FieldMask::TentativelyInclude, v)) + }) + .collect::>(); 
+ + field_mask + .into_iter() + .map(|(field, (mask, (datatype, cell_val_num)))| { + let field_data = if mask.is_included() { + let params = FieldDataParameters { + nrecords: (nrecords..=nrecords).into(), + datatype: Some( + FieldStrategyDatatype::Datatype( + *datatype, + *cell_val_num, + ), + ), + ..field_data_parameters_base.clone() + }; + Some( + any_with::(params) + .new_tree(runner) + .unwrap() + .current(), + ) + } else { + None + }; + (field, (mask, field_data)) + }) + .collect::)>>( + ) + } + Self::WriteSchema(schema) => { + let field_mask = { + let dimensions_mask = { + let mask = match schema.array_type { + ArrayType::Dense => { + /* dense array coordinates are handled by a subarray */ + FieldMask::Exclude + } + ArrayType::Sparse => { + /* sparse array must write coordinates */ + FieldMask::Include + } + }; + schema + .domain + .dimension + .iter() + .map(|d| (SchemaField::from(d.clone()), mask)) + .collect::>() + }; + + /* as of this writing, write queries must write to all attributes */ + let attributes_mask = schema + .attributes + .iter() + .map(|a| { + (SchemaField::from(a.clone()), FieldMask::Include) + }) + .collect::>(); + + dimensions_mask + .into_iter() + .chain(attributes_mask) + .collect::>() + }; + + if schema.array_type == ArrayType::Sparse + && !schema.allow_duplicates.unwrap_or(false) + { + // dimension coordinates must be unique, generate them first + let unique_keys = schema + .domain + .dimension + .iter() + .map(|d| d.name.clone()) + .collect::>(); + let dimension_data = schema + .domain + .dimension + .iter() + .map(|d| { + let params = FieldDataParameters { + nrecords: (*nrecords.end()..=*nrecords.end()) + .into(), + datatype: Some( + FieldStrategyDatatype::SchemaField( + SchemaField::Dimension(d.clone()), + ), + ), + ..field_data_parameters_base.clone() + }; + ( + d.name.clone(), + any_with::(params) + .new_tree(runner) + .unwrap() + .current(), + ) + }) + .collect::>(); + + let mut dedup_fields = + 
Cells::new(dimension_data).dedup(&unique_keys); + + // choose the number of records + let nrecords = { + /* + * TODO: not really accurate but in practice nrecords.start + * is probably zero so this is the easy lazy thing to do + */ + assert!(*nrecords.start() <= dedup_fields.len()); + + (*nrecords.start()..=dedup_fields.len()) + .new_tree(runner) + .unwrap() + .current() + }; + + field_mask.into_iter() + .map(|(field, mask)| { + let field_name = field.name().to_owned(); + let field_data = if let Some(mut dim) = dedup_fields.fields.remove(&field_name) { + assert!(field.is_dimension()); + dim.truncate(nrecords); + dim + } else { + assert!(field.is_attribute()); + let params = FieldDataParameters { + nrecords: (nrecords..=nrecords).into(), + datatype: Some(FieldStrategyDatatype::SchemaField(field)), + ..field_data_parameters_base.clone() + }; + any_with::(params) + .new_tree(runner) + .unwrap() + .current() + }; + assert_eq!(nrecords, field_data.len()); + (field_name, (mask, Some(field_data))) + }) + .collect::)>>() + } else { + let nrecords = nrecords.new_tree(runner).unwrap().current(); + field_mask + .into_iter() + .map(|(field, mask)| { + let field_name = field.name().to_string(); + let field_data = if mask.is_included() { + let params = FieldDataParameters { + nrecords: (nrecords..=nrecords).into(), + datatype: Some( + FieldStrategyDatatype::SchemaField(field), + ), + ..field_data_parameters_base.clone() + }; + Some( + any_with::(params) + .new_tree(runner) + .unwrap() + .current(), + ) + } else { + None + }; + (field_name, (mask, field_data)) + }) + .collect::)>>( + ) + } + } + Self::ReadSchema(_) => { + /* presumably any subset of the fields */ + unimplemented!() + } + } + } +} + +#[derive(Clone, Debug)] +pub struct CellsParameters { + pub schema: Option, + pub min_records: usize, + pub max_records: usize, + pub cell_min_var_size: usize, + pub cell_max_var_size: usize, +} + +impl CellsParameters { + pub fn min_records_default() -> usize { + 
**tiledb_proptest_config::TILEDB_STRATEGY_CELLS_PARAMETERS_NUM_RECORDS_MIN + } + + pub fn max_records_default() -> usize { + **tiledb_proptest_config::TILEDB_STRATEGY_CELLS_PARAMETERS_NUM_RECORDS_MAX + } + + pub fn cell_min_var_size_default() -> usize { + **tiledb_proptest_config::TILEDB_STRATEGY_CELLS_PARAMETERS_CELL_VAR_SIZE_MIN + } + + pub fn cell_max_var_size_default() -> usize { + **tiledb_proptest_config::TILEDB_STRATEGY_CELLS_PARAMETERS_CELL_VAR_SIZE_MAX + } +} + +impl Default for CellsParameters { + fn default() -> Self { + CellsParameters { + schema: None, + min_records: Self::min_records_default(), + max_records: Self::max_records_default(), + cell_min_var_size: Self::cell_min_var_size_default(), + cell_max_var_size: Self::cell_max_var_size_default(), + } + } +} + +#[derive(Debug)] +struct CellsStrategy { + schema: CellsStrategySchema, + params: CellsParameters, +} + +impl CellsStrategy { + pub fn new(schema: CellsStrategySchema, params: CellsParameters) -> Self { + CellsStrategy { schema, params } + } + + /// Returns an upper bound on the number of cells which can possibly be produced + fn nrecords_limit(&self) -> Option { + if let Some(schema) = self.schema.array_schema() { + if !schema.allow_duplicates.unwrap_or(true) { + return schema.domain.num_cells(); + } + } + None + } +} + +impl Strategy for CellsStrategy { + type Tree = CellsValueTree; + type Value = Cells; + + fn new_tree(&self, runner: &mut TestRunner) -> NewTree { + /* Choose the maximum number of records */ + let strat_nrecords = if let Some(limit) = self.nrecords_limit() { + if limit < self.params.min_records { + let r = format!("Schema and parameters are not satisfiable: schema.domain.num_cells() = {}, self.params.min_records = {}", limit, self.params.min_records); + return Err(proptest::test_runner::Reason::from(r)); + } else { + let max_records = std::cmp::min(self.params.max_records, limit); + self.params.min_records..=max_records + } + } else { + 
self.params.min_records..=self.params.max_records + }; + + /* generate an initial set of fields to write */ + let field_tree = self.schema.new_field_tree(runner, strat_nrecords); + + Ok(CellsValueTree::new(self.params.clone(), field_tree)) + } +} + +impl Arbitrary for Cells { + type Parameters = CellsParameters; + type Strategy = BoxedStrategy; + + fn arbitrary_with(mut args: Self::Parameters) -> Self::Strategy { + if let Some(schema) = args.schema.take() { + CellsStrategy::new(schema, args).boxed() + } else { + let keys = + tiledb_pod::array::attribute::strategy::prop_attribute_name(); + let values = (any::(), any::()); + proptest::collection::hash_map(keys, values, 1..16) + .prop_flat_map(move |values| { + CellsStrategy::new( + CellsStrategySchema::Fields(values), + args.clone(), + ) + }) + .boxed() + } + } +} diff --git a/test-utils/cells/src/write/mod.rs b/test-utils/cells/src/write/mod.rs new file mode 100644 index 00000000..d059f65d --- /dev/null +++ b/test-utils/cells/src/write/mod.rs @@ -0,0 +1,399 @@ +#[cfg(feature = "proptest-strategies")] +pub mod strategy; + +use std::ops::Deref; + +use tiledb_common::array::{CellOrder, CellValNum}; +use tiledb_common::datatype::physical::BitsOrd; +use tiledb_common::range::{NonEmptyDomain, Range, SingleValueRange}; +use tiledb_pod::array::schema::SchemaData; + +use crate::{typed_field_data_go, Cells}; + +#[derive(Clone, Debug)] +pub struct DenseWriteInput { + pub layout: CellOrder, + pub data: Cells, + pub subarray: Vec, +} + +#[derive(Clone, Debug)] +pub struct SparseWriteInput { + pub dimensions: Vec<(String, CellValNum)>, + pub data: Cells, +} + +impl SparseWriteInput { + pub fn from_schema_and_data(schema: &SchemaData, data: Cells) -> Self { + let dimensions = schema + .domain + .dimension + .iter() + .map(|d| (d.name.clone(), d.cell_val_num())) + .collect::>(); + SparseWriteInput { dimensions, data } + } + + /// Returns the minimum bounding rectangle containing all + /// the coordinates of this write operation. 
+ pub fn domain(&self) -> Option { + self.dimensions + .iter() + .map(|(dim, cell_val_num)| { + let dim_cells = self.data.fields().get(dim).unwrap(); + Some(typed_field_data_go!( + dim_cells, + _DT, + ref dim_cells, + { + let min = + *dim_cells.iter().min_by(|l, r| l.bits_cmp(r))?; + let max = + *dim_cells.iter().max_by(|l, r| l.bits_cmp(r))?; + Range::from(&[min, max]) + }, + { + let min = dim_cells + .iter() + .min_by(|l, r| l.bits_cmp(r))? + .clone() + .into_boxed_slice(); + let max = dim_cells + .iter() + .max_by(|l, r| l.bits_cmp(r))? + .clone() + .into_boxed_slice(); + match cell_val_num { + CellValNum::Fixed(_) => { + Range::try_from((*cell_val_num, min, max)) + .unwrap() + } + CellValNum::Var => Range::from((min, max)), + } + } + )) + }) + .collect::>() + } + + /// Sort the data cells using the dimensions as sort keys, in order. + pub fn sort_cells(&mut self) { + let keys = self + .dimensions + .iter() + .map(|(k, _)| k.clone()) + .collect::>(); + self.data.sort(&keys) + } +} + +#[derive(Debug)] +pub struct DenseWriteSequence { + pub writes: Vec, +} + +impl DenseWriteSequence { + pub fn iter_mut(&mut self) -> impl Iterator { + self.writes.iter_mut() + } +} + +impl Deref for DenseWriteSequence { + type Target = Vec; + fn deref(&self) -> &Self::Target { + &self.writes + } +} + +impl From for DenseWriteSequence +where + T: Into>, +{ + fn from(value: T) -> Self { + DenseWriteSequence { + writes: value.into(), + } + } +} + +impl IntoIterator for DenseWriteSequence { + type Item = DenseWriteInput; + type IntoIter = as IntoIterator>::IntoIter; + + fn into_iter(self) -> Self::IntoIter { + self.writes.into_iter() + } +} + +impl FromIterator for DenseWriteSequence { + fn from_iter(iter: T) -> Self + where + T: IntoIterator, + { + DenseWriteSequence { + writes: iter.into_iter().collect::>(), + } + } +} + +#[derive(Debug)] +pub struct SparseWriteSequence { + pub writes: Vec, +} + +impl SparseWriteSequence { + pub fn iter_mut(&mut self) -> impl Iterator { + 
self.writes.iter_mut() + } +} + +impl Deref for SparseWriteSequence { + type Target = Vec; + fn deref(&self) -> &Self::Target { + &self.writes + } +} + +impl From for SparseWriteSequence +where + T: Into>, +{ + fn from(value: T) -> Self { + SparseWriteSequence { + writes: value.into(), + } + } +} + +impl IntoIterator for SparseWriteSequence { + type Item = SparseWriteInput; + type IntoIter = as IntoIterator>::IntoIter; + + fn into_iter(self) -> Self::IntoIter { + self.writes.into_iter() + } +} + +impl FromIterator for SparseWriteSequence { + fn from_iter(iter: T) -> Self + where + T: IntoIterator, + { + SparseWriteSequence { + writes: iter.into_iter().collect::>(), + } + } +} + +#[derive(Clone, Debug)] +pub enum WriteInput { + Dense(DenseWriteInput), + Sparse(SparseWriteInput), +} + +impl WriteInput { + /// Returns a reference to the cells of input of this write operation. + pub fn cells(&self) -> &Cells { + match self { + Self::Dense(ref dense) => &dense.data, + Self::Sparse(ref sparse) => &sparse.data, + } + } + + /// Returns a mutable reference to the cells of input of this write operation. + pub fn cells_mut(&mut self) -> &mut Cells { + match self { + Self::Dense(ref mut dense) => &mut dense.data, + Self::Sparse(ref mut sparse) => &mut sparse.data, + } + } + + /// Returns the minimum bounding rectangle containing + /// the coordinates of this write operation. + pub fn domain(&self) -> Option { + match self { + Self::Dense(ref dense) => Some( + dense + .subarray + .clone() + .into_iter() + .map(Range::from) + .collect::(), + ), + Self::Sparse(ref sparse) => sparse.domain(), + } + } + + /// Returns the subarray for this write operation, + /// if it is a dense write. Returns `None` otherwise. + pub fn subarray(&self) -> Option { + if let Self::Dense(_) = self { + self.domain() + } else { + None + } + } + + /// Consumes `self` and returns the underlying test data. 
+ pub fn unwrap_cells(self) -> Cells { + match self { + Self::Dense(dense) => dense.data, + Self::Sparse(sparse) => sparse.data, + } + } +} + +pub enum WriteInputRef<'a> { + Dense(&'a DenseWriteInput), + Sparse(&'a SparseWriteInput), +} + +impl WriteInputRef<'_> { + /// Returns a reference to the cells of input of this write operation. + pub fn cells(&self) -> &Cells { + match self { + Self::Dense(dense) => &dense.data, + Self::Sparse(sparse) => &sparse.data, + } + } + + pub fn cloned(&self) -> WriteInput { + match self { + Self::Dense(dense) => WriteInput::Dense((*dense).clone()), + Self::Sparse(sparse) => WriteInput::Sparse((*sparse).clone()), + } + } + + /// Returns the minimum bounding rectangle containing + /// the coordinates of this write operation. + pub fn domain(&self) -> Option { + match self { + Self::Dense(dense) => Some( + dense + .subarray + .clone() + .into_iter() + .map(Range::from) + .collect::(), + ), + Self::Sparse(sparse) => sparse.domain(), + } + } + + /// Returns the subarray for this write operation, + /// if it is a dense write. Returns `None` otherwise. 
+ pub fn subarray(&self) -> Option { + if let Self::Dense(_) = self { + self.domain() + } else { + None + } + } +} + +#[derive(Debug)] +pub enum WriteSequence { + Dense(DenseWriteSequence), + Sparse(SparseWriteSequence), +} + +impl WriteSequence { + pub fn iter(&self) -> WriteSequenceRefIter { + self.into_iter() + } +} + +impl From for WriteSequence { + fn from(value: WriteInput) -> Self { + match value { + WriteInput::Dense(dense) => Self::Dense(DenseWriteSequence { + writes: vec![dense], + }), + WriteInput::Sparse(sparse) => Self::Sparse(SparseWriteSequence { + writes: vec![sparse], + }), + } + } +} + +impl IntoIterator for WriteSequence { + type Item = WriteInput; + type IntoIter = WriteSequenceIter; + + fn into_iter(self) -> Self::IntoIter { + match self { + Self::Dense(dense) => WriteSequenceIter::Dense(dense.into_iter()), + Self::Sparse(sparse) => { + WriteSequenceIter::Sparse(sparse.into_iter()) + } + } + } +} + +impl<'a> IntoIterator for &'a WriteSequence { + type Item = WriteInputRef<'a>; + type IntoIter = WriteSequenceRefIter<'a>; + + fn into_iter(self) -> Self::IntoIter { + match *self { + WriteSequence::Dense(ref dense) => { + WriteSequenceRefIter::Dense(dense.iter()) + } + WriteSequence::Sparse(ref sparse) => { + WriteSequenceRefIter::Sparse(sparse.iter()) + } + } + } +} + +pub enum WriteSequenceIter { + Dense(::IntoIter), + Sparse(::IntoIter), +} + +impl Iterator for WriteSequenceIter { + type Item = WriteInput; + + fn next(&mut self) -> Option { + match self { + Self::Dense(ref mut dense) => dense.next().map(WriteInput::Dense), + Self::Sparse(ref mut sparse) => { + sparse.next().map(WriteInput::Sparse) + } + } + } + + fn size_hint(&self) -> (usize, Option) { + match self { + Self::Dense(ref d) => d.size_hint(), + Self::Sparse(ref s) => s.size_hint(), + } + } +} + +pub enum WriteSequenceRefIter<'a> { + Dense(<&'a Vec as IntoIterator>::IntoIter), + Sparse(<&'a Vec as IntoIterator>::IntoIter), +} + +impl<'a> Iterator for WriteSequenceRefIter<'a> { + 
type Item = WriteInputRef<'a>; + + fn next(&mut self) -> Option { + match self { + Self::Dense(ref mut dense) => { + dense.next().map(WriteInputRef::Dense) + } + Self::Sparse(ref mut sparse) => { + sparse.next().map(WriteInputRef::Sparse) + } + } + } + + fn size_hint(&self) -> (usize, Option) { + match self { + Self::Dense(ref d) => d.size_hint(), + Self::Sparse(ref s) => s.size_hint(), + } + } +} diff --git a/test-utils/cells/src/write/strategy.rs b/test-utils/cells/src/write/strategy.rs new file mode 100644 index 00000000..a5e9a592 --- /dev/null +++ b/test-utils/cells/src/write/strategy.rs @@ -0,0 +1,624 @@ +use std::fmt::{Debug, Formatter, Result as FmtResult}; +use std::ops::RangeInclusive; +use std::rc::Rc; + +use proptest::prelude::*; +use proptest::strategy::{NewTree, ValueTree}; +use proptest::test_runner::TestRunner; + +use tiledb_common::array::{ArrayType, CellOrder, CellValNum}; +use tiledb_common::range::{Range, SingleValueRange}; +use tiledb_common::single_value_range_go; +use tiledb_pod::array::schema::SchemaData; + +use super::*; +use crate::strategy::{ + CellsParameters, CellsStrategySchema, FieldDataParameters, +}; +use crate::{Cells, StructuredCells}; + +type BoxedValueTree = Box>; + +#[derive(Clone, Debug)] +pub struct DenseWriteParameters { + pub schema: Option>, + pub layout: Option, + pub memory_limit: usize, +} + +impl DenseWriteParameters { + pub fn memory_limit_default() -> usize { + **tiledb_proptest_config::TILEDB_STRATEGY_DENSE_WRITE_PARAMETERS_MEMORY_LIMIT + } +} + +impl Default for DenseWriteParameters { + fn default() -> Self { + DenseWriteParameters { + schema: Default::default(), + layout: Default::default(), + memory_limit: Self::memory_limit_default(), + } + } +} + +pub struct DenseWriteValueTree { + layout: CellOrder, + field_order: Vec, + bounding_subarray: Vec>, + subarray: Vec>, + cells: StructuredCells, + prev_shrink: Option, +} + +impl DenseWriteValueTree { + pub fn new( + layout: CellOrder, + bounding_subarray: Vec, + 
subarray: Vec>, + cells: Cells, + ) -> Self { + let field_order = + cells.fields().keys().cloned().collect::>(); + + let cells = { + let dimension_len = bounding_subarray + .iter() + .map(|r| { + usize::try_from(r.num_cells().unwrap()) + .expect("Too many cells to fit in memory") + }) + .collect::>(); + StructuredCells::new(dimension_len, cells) + }; + + let bounding_subarray = bounding_subarray + .into_iter() + .map(|range| { + let r = RangeInclusive::::try_from(range).unwrap(); + assert!(r.start() <= r.end()); + r + }) + .collect::>>(); + + DenseWriteValueTree { + layout, + field_order, + bounding_subarray, + subarray, + cells, + prev_shrink: None, + } + } + + fn subarray_current(&self) -> Vec { + self.subarray + .iter() + .map(|tree| tree.current()) + .collect::>() + } + + fn cells_for_subarray( + &self, + subarray: &[SingleValueRange], + ) -> StructuredCells { + let slices = self + .bounding_subarray + .iter() + .zip(subarray.iter()) + .map(|(complete, current)| { + let current = + RangeInclusive::::try_from(current.clone()).unwrap(); + + assert!(current.start() <= current.end()); + + assert!( + complete.start() <= current.start(), + "complete = {:?}, current = {:?}", + complete, + current + ); + assert!( + current.end() <= complete.end(), + "complete = {:?}, current = {:?}", + complete, + current + ); + + let start = current.start() - complete.start(); + let end = current.end() - complete.start() + 1; + let ustart = usize::try_from(start) + .expect("Current range is narrower than bounding range"); + let uend = usize::try_from(end) + .expect("Current range is narrower than bounding range"); + ustart..uend + }) + .collect::>>(); + + self.cells.slice(slices) + } +} + +impl Debug for DenseWriteValueTree { + fn fmt(&self, f: &mut Formatter) -> FmtResult { + f.debug_struct("DenseWriteValueTree") + .field("layout", &self.layout) + .field("field_order", &self.field_order) + .field("bounding_subarray", &self.bounding_subarray) + .field("subarray", 
&self.subarray_current()) + .field("prev_shrink", &self.prev_shrink) + .finish() + } +} + +impl ValueTree for DenseWriteValueTree { + type Value = DenseWriteInput; + + fn current(&self) -> Self::Value { + let subarray = self.subarray_current(); + let cells = self.cells_for_subarray(&subarray); + + DenseWriteInput { + layout: self.layout, + data: cells.into_inner(), + subarray, + } + } + + fn simplify(&mut self) -> bool { + // try shrinking each dimension in round-robin order, + // beginning with the dimension after whichever we + // previously shrunk + let start = self.prev_shrink.map(|d| d + 1).unwrap_or(0); + + for i in 0..self.subarray.len() { + let idx = (start + i) % self.subarray.len(); + if self.subarray[idx].simplify() { + self.prev_shrink = Some(idx); + return true; + } + } + + self.prev_shrink = None; + false + } + + fn complicate(&mut self) -> bool { + // complicate whichever dimension we previously simplified + if let Some(d) = self.prev_shrink { + if self.subarray[d].complicate() { + // we may be able to complicate again, keep prev_shrink + true + } else { + self.prev_shrink = None; + false + } + } else { + false + } + } +} + +#[derive(Debug)] +pub struct DenseWriteStrategy { + schema: Rc, + layout: CellOrder, + params: DenseWriteParameters, +} + +impl DenseWriteStrategy { + pub fn new( + schema: Rc, + layout: CellOrder, + params: DenseWriteParameters, + ) -> Self { + DenseWriteStrategy { + schema, + layout, + params, + } + } +} + +impl Strategy for DenseWriteStrategy { + type Tree = DenseWriteValueTree; + type Value = DenseWriteInput; + + fn new_tree(&self, runner: &mut TestRunner) -> NewTree { + /* + * For simplicity, we will bound the memory used at each dimension + * rather than keeping a moving product of the accumulated memory + */ + let memory_limit = + { self.params.memory_limit / self.schema.domain.dimension.len() }; + + if matches!(self.layout, CellOrder::Global) { + // necessary to align to tile boundaries + unimplemented!() + } + + let 
est_cell_size: usize = self + .schema + .fields() + .map(|field| { + match field.cell_val_num().unwrap_or(CellValNum::single()) { + CellValNum::Fixed(nz) => { + /* exact */ + nz.get() as usize * field.datatype().size() + } + CellValNum::Var => { + /* estimate */ + let params = + ::default(); + let est_nvalues = (params.value_min_var_size + + params.value_max_var_size) + / 2; + est_nvalues * field.datatype().size() + } + } + }) + .sum(); + + let cell_limit: usize = memory_limit / est_cell_size; + + /* choose maximal subarray for the write, we will shrink within this window */ + let strat_subarray_bounds = self + .schema + .domain + .dimension + .iter() + .map(|d| { + d.subarray_strategy(Some(cell_limit)).expect("Dense dimension subarray not found") + .prop_map(|r| { + let Range::Single(s) = r else { + unreachable!("Dense dimension subarray is not `Range::Single`: {:?}", r) + }; + s + }).boxed() + }) + .collect::>>(); + + let bounding_subarray = strat_subarray_bounds + .into_iter() + .map(|strat| strat.new_tree(runner).unwrap().current()) + .collect::>(); + + /* prepare tree for each subarray dimension */ + let strat_subarray = bounding_subarray + .iter() + .cloned() + .map(|dim| { + single_value_range_go!( + dim, + _DT: Integral, + start, + end, + { + (start..=end) + .prop_flat_map(move |lower| { + (Just(lower), lower..=end).prop_map( + move |(lower, upper)| { + SingleValueRange::from(&[lower, upper]) + }, + ) + }) + .boxed() + }, + unreachable!() + ) + }) + .collect::>>(); + + let mut subarray: Vec> = vec![]; + for range in strat_subarray { + subarray.push(range.new_tree(runner).unwrap()); + } + + let cells = { + let ncells = bounding_subarray + .iter() + .map(|range| { + usize::try_from(range.num_cells().unwrap()) + .expect("Too many cells to fit in memory") + }) + .product(); + assert!(ncells > 0); + let params = CellsParameters { + schema: Some(CellsStrategySchema::WriteSchema(Rc::clone( + &self.schema, + ))), + min_records: ncells, + max_records: ncells, + 
..Default::default() + }; + any_with::(params).new_tree(runner)?.current() + }; + + Ok(DenseWriteValueTree::new( + self.layout, + bounding_subarray, + subarray, + cells, + )) + } +} + +impl Arbitrary for DenseWriteInput { + type Parameters = DenseWriteParameters; + type Strategy = BoxedStrategy; + + fn arbitrary_with(args: Self::Parameters) -> Self::Strategy { + let mut args = args; + let strat_schema = match args.schema.take() { + None => any::().prop_map(Rc::new).boxed(), + Some(schema) => Just(schema).boxed(), + }; + let strat_layout = match args.layout.take() { + None => prop_oneof![ + Just(CellOrder::RowMajor), + Just(CellOrder::ColumnMajor), + /* TODO: CellOrder::Global is possible but has more constraints */ + ].boxed(), + Some(layout) => Just(layout).boxed() + }; + + (strat_schema, strat_layout) + .prop_flat_map(move |(schema, layout)| { + DenseWriteStrategy::new(schema, layout, args.clone()) + }) + .boxed() + } +} + +pub type SparseWriteParameters = DenseWriteParameters; // TODO: determine if this should be different + +impl Arbitrary for SparseWriteInput { + type Parameters = SparseWriteParameters; + type Strategy = BoxedStrategy; + + fn arbitrary_with(params: Self::Parameters) -> Self::Strategy { + if let Some(schema) = params.schema.as_ref() { + let schema = Rc::clone(schema); + let cells_params = CellsParameters { + schema: Some(CellsStrategySchema::WriteSchema(Rc::clone( + &schema, + ))), + ..Default::default() + }; + any_with::(cells_params) + .prop_map(move |data| { + let dimensions = schema + .domain + .dimension + .iter() + .map(|d| (d.name.clone(), d.cell_val_num())) + .collect::>(); + SparseWriteInput { dimensions, data } + }) + .boxed() + } else { + any::() + .prop_flat_map(|data| { + (0..data.fields().len(), Just(data)).prop_map( + |(ndim, data)| SparseWriteInput { + dimensions: data + .fields() + .iter() + .take(ndim) + .map(|(fname, fdata)| { + ( + fname.clone(), + if fdata.is_cell_single() { + CellValNum::single() + } else { + 
CellValNum::Var + }, + ) + }) + .collect::>(), + data, + }, + ) + }) + .boxed() + } + } +} + +impl Arbitrary for DenseWriteSequence { + type Parameters = DenseWriteSequenceParameters; + type Strategy = BoxedStrategy; + + fn arbitrary_with(params: Self::Parameters) -> Self::Strategy { + fn prop_write_sequence( + schema: Rc, + seq_params: DenseWriteSequenceParameters, + ) -> BoxedStrategy { + let write_params = DenseWriteParameters { + schema: Some(schema), + ..seq_params.write.as_ref().clone() + }; + proptest::collection::vec( + any_with::(write_params), + seq_params.min_writes..=seq_params.max_writes, + ) + .prop_map(|writes| DenseWriteSequence { writes }) + .boxed() + } + + if let Some(schema) = params.write.schema.as_ref() { + prop_write_sequence(Rc::clone(schema), params) + } else { + any::() + .prop_flat_map(move |schema| { + prop_write_sequence(Rc::new(schema), params.clone()) + }) + .boxed() + } + } +} + +impl Arbitrary for SparseWriteSequence { + type Parameters = SparseWriteSequenceParameters; + type Strategy = BoxedStrategy; + + fn arbitrary_with(params: Self::Parameters) -> Self::Strategy { + pub fn prop_write_sequence( + schema: Rc, + seq_params: SparseWriteSequenceParameters, + ) -> impl Strategy { + let write_params = SparseWriteParameters { + schema: Some(schema), + ..seq_params.write.as_ref().clone() + }; + proptest::collection::vec( + any_with::(write_params), + seq_params.min_writes..=seq_params.max_writes, + ) + .prop_map(|writes| SparseWriteSequence { writes }) + } + + if let Some(schema) = params.write.schema.as_ref() { + prop_write_sequence(Rc::clone(schema), params).boxed() + } else { + any::() + .prop_flat_map(move |schema| { + prop_write_sequence(Rc::new(schema), params.clone()) + }) + .boxed() + } + } +} + +#[derive(Debug)] +pub enum WriteParameters { + Dense(DenseWriteParameters), + Sparse(SparseWriteParameters), +} + +impl WriteParameters { + pub fn default_for(schema: Rc) -> Self { + match schema.array_type { + ArrayType::Dense => 
Self::Dense(DenseWriteParameters { + schema: Some(schema), + ..Default::default() + }), + ArrayType::Sparse => Self::Sparse(SparseWriteParameters { + schema: Some(schema), + ..Default::default() + }), + } + } +} + +impl Default for WriteParameters { + fn default() -> Self { + Self::Dense(DenseWriteParameters::default()) + } +} + +impl Arbitrary for WriteInput { + type Parameters = WriteParameters; + type Strategy = BoxedStrategy; + + fn arbitrary_with(params: Self::Parameters) -> Self::Strategy { + match params { + WriteParameters::Dense(d) => any_with::(d) + .prop_map(WriteInput::Dense) + .boxed(), + WriteParameters::Sparse(s) => any_with::(s) + .prop_map(WriteInput::Sparse) + .boxed(), + } + } +} + +#[derive(Clone, Debug)] +pub struct WriteSequenceParametersImpl { + pub write: Rc, + pub min_writes: usize, + pub max_writes: usize, +} + +pub type DenseWriteSequenceParameters = + WriteSequenceParametersImpl; +pub type SparseWriteSequenceParameters = + WriteSequenceParametersImpl; + +impl WriteSequenceParametersImpl { + pub fn min_writes_default() -> usize { + **tiledb_proptest_config::TILEDB_STRATEGY_WRITE_SEQUENCE_PARAMETERS_MIN_WRITES + } + + pub fn max_writes_default() -> usize { + **tiledb_proptest_config::TILEDB_STRATEGY_WRITE_SEQUENCE_PARAMETERS_MAX_WRITES + } +} + +impl Default for WriteSequenceParametersImpl +where + W: Default, +{ + fn default() -> Self { + WriteSequenceParametersImpl { + write: Rc::new(Default::default()), + min_writes: Self::min_writes_default(), + max_writes: Self::max_writes_default(), + } + } +} + +#[derive(Debug)] +pub enum WriteSequenceParameters { + Dense(DenseWriteSequenceParameters), + Sparse(SparseWriteSequenceParameters), +} + +impl WriteSequenceParameters { + pub fn default_for(schema: Rc) -> Self { + match schema.array_type { + ArrayType::Dense => Self::Dense(DenseWriteSequenceParameters { + write: Rc::new(DenseWriteParameters { + schema: Some(schema), + ..Default::default() + }), + min_writes: 
DenseWriteSequenceParameters::min_writes_default(), + max_writes: DenseWriteSequenceParameters::max_writes_default(), + }), + ArrayType::Sparse => Self::Sparse(SparseWriteSequenceParameters { + write: Rc::new(SparseWriteParameters { + schema: Some(schema), + ..Default::default() + }), + min_writes: SparseWriteSequenceParameters::min_writes_default(), + max_writes: SparseWriteSequenceParameters::max_writes_default(), + }), + } + } +} + +impl Default for WriteSequenceParameters { + fn default() -> Self { + Self::Dense(Default::default()) + } +} + +impl Arbitrary for WriteSequence { + type Parameters = WriteSequenceParameters; + type Strategy = BoxedStrategy; + + fn arbitrary_with(params: Self::Parameters) -> Self::Strategy { + match params { + WriteSequenceParameters::Dense(d) => { + any_with::(d) + .prop_map(Self::Dense) + .boxed() + } + WriteSequenceParameters::Sparse(s) => { + any_with::(s) + .prop_map(Self::Sparse) + .boxed() + } + } + } +} diff --git a/test-utils/proptest-config/Cargo.toml b/test-utils/proptest-config/Cargo.toml new file mode 100644 index 00000000..ec523d68 --- /dev/null +++ b/test-utils/proptest-config/Cargo.toml @@ -0,0 +1,7 @@ +[package] +name = "tiledb-proptest-config" +edition.workspace = true +rust-version.workspace = true +version.workspace = true + +[dependencies] diff --git a/test-utils/proptest-config/src/lib.rs b/test-utils/proptest-config/src/lib.rs new file mode 100644 index 00000000..783fd99c --- /dev/null +++ b/test-utils/proptest-config/src/lib.rs @@ -0,0 +1,122 @@ +use std::ops::Deref; +use std::str::FromStr; +use std::sync::LazyLock; + +fn try_parse_env(env: &str) -> Option +where + T: FromStr, +{ + match std::env::var(env) { + Ok(value) => Some( + T::from_str(&value) + .unwrap_or_else(|_| panic!("Invalid value for {}", env)), + ), + Err(_) => None, + } +} + +/// The value of a strategy configuration parameter and its provenance. 
+pub enum Configuration { + Default(T), + Environmental(T), +} + +impl Configuration { + /// Converts to [Option], returning the wrapped value + /// if this is [Environmental] and [None] otherwise. + pub fn environmental(&self) -> Option + where + T: Copy, + { + match self { + Self::Default(_) => None, + Self::Environmental(value) => Some(*value), + } + } +} + +impl Deref for Configuration { + type Target = T; + + fn deref(&self) -> &Self::Target { + match self { + Self::Default(ref value) => value, + Self::Environmental(ref value) => value, + } + } +} + +macro_rules! config_param { + ($name:ident, $type:ty, $default:expr) => { + pub static $name: LazyLock> = + LazyLock::new(|| { + if let Some(value) = try_parse_env::<$type>(stringify!($name)) { + Configuration::Environmental(value) + } else { + Configuration::Default($default) + } + }); + }; +} + +// array/domain/strategy.rs +config_param!(TILEDB_STRATEGY_DOMAIN_PARAMETERS_DIMENSIONS_MIN, usize, 1); +config_param!(TILEDB_STRATEGY_DOMAIN_PARAMETERS_DIMENSIONS_MAX, usize, 8); +config_param!( + TILEDB_STRATEGY_DOMAIN_PARAMETERS_CELLS_PER_TILE_LIMIT, + usize, + 1024 * 32 +); + +// array/schema/strategy.rs +config_param!(TILEDB_STRATEGY_SCHEMA_PARAMETERS_ATTRIBUTES_MIN, usize, 1); +config_param!(TILEDB_STRATEGY_SCHEMA_PARAMETERS_ATTRIBUTES_MAX, usize, 8); +config_param!( + TILEDB_STRATEGY_SCHEMA_PARAMETERS_SPARSE_TILE_CAPACITY_MIN, + u64, + 1 +); +config_param!( + TILEDB_STRATEGY_SCHEMA_PARAMETERS_SPARSE_TILE_CAPACITY_MAX, + u64, + **TILEDB_STRATEGY_DOMAIN_PARAMETERS_CELLS_PER_TILE_LIMIT as u64 +); + +// array/enumeration/strategy.rs +config_param!( + TILEDB_STRATEGY_ENUMERATION_PARAMETERS_NUM_VARIANTS_MIN, + usize, + 1 +); +config_param!( + TILEDB_STRATEGY_ENUMERATION_PARAMETERS_NUM_VARIANTS_MAX, + usize, + 1024 +); + +// query/strategy.rs +config_param!(TILEDB_STRATEGY_CELLS_PARAMETERS_NUM_RECORDS_MIN, usize, 0); +config_param!(TILEDB_STRATEGY_CELLS_PARAMETERS_NUM_RECORDS_MAX, usize, 16); 
+config_param!(TILEDB_STRATEGY_CELLS_PARAMETERS_CELL_VAR_SIZE_MIN, usize, 0); +config_param!( + TILEDB_STRATEGY_CELLS_PARAMETERS_CELL_VAR_SIZE_MAX, + usize, + 16 +); + +// query/write/strategy.rs +config_param!( + TILEDB_STRATEGY_DENSE_WRITE_PARAMETERS_MEMORY_LIMIT, + usize, + 16 * 1024 // chosen arbitrarily; seems small +); +config_param!( + TILEDB_STRATEGY_WRITE_SEQUENCE_PARAMETERS_MIN_WRITES, + usize, + 1 +); +config_param!( + TILEDB_STRATEGY_WRITE_SEQUENCE_PARAMETERS_MAX_WRITES, + usize, + 8 +); diff --git a/test-utils/signal/Cargo.toml b/test-utils/signal/Cargo.toml new file mode 100644 index 00000000..48c0f259 --- /dev/null +++ b/test-utils/signal/Cargo.toml @@ -0,0 +1,8 @@ +[package] +name = "signal" +edition.workspace = true +rust-version.workspace = true +version.workspace = true + +[dependencies] +nix = { version = "0", features = ["signal"] } diff --git a/tiledb/test-utils/src/signal.rs b/test-utils/signal/src/lib.rs similarity index 100% rename from tiledb/test-utils/src/signal.rs rename to test-utils/signal/src/lib.rs diff --git a/test-utils/strategy-ext/Cargo.toml b/test-utils/strategy-ext/Cargo.toml new file mode 100644 index 00000000..b8ecb7e4 --- /dev/null +++ b/test-utils/strategy-ext/Cargo.toml @@ -0,0 +1,8 @@ +[package] +name = "strategy-ext" +edition.workspace = true +rust-version.workspace = true +version.workspace = true + +[dependencies] +proptest = { workspace = true } diff --git a/tiledb/test-utils/src/strategy/mod.rs b/test-utils/strategy-ext/src/lib.rs similarity index 100% rename from tiledb/test-utils/src/strategy/mod.rs rename to test-utils/strategy-ext/src/lib.rs diff --git a/tiledb/test-utils/src/strategy/meta.rs b/test-utils/strategy-ext/src/meta.rs similarity index 98% rename from tiledb/test-utils/src/strategy/meta.rs rename to test-utils/strategy-ext/src/meta.rs index a7977b10..f81bb34d 100644 --- a/tiledb/test-utils/src/strategy/meta.rs +++ b/test-utils/strategy-ext/src/meta.rs @@ -5,13 +5,13 @@ use proptest::prelude::*; use 
proptest::strategy::{NewTree, ValueTree}; use proptest::test_runner::{Config as ProptestConfig, TestRunner}; -use crate::strategy::sequence::SequenceValueTree; +use crate::sequence::SequenceValueTree; /// Strategy to create [ValueTree] objects for a wrapped [Strategy]. /// /// ``` /// # use proptest::prelude::*; -/// # use tiledb_test_utils::strategy::meta::ValueTreeStrategy; +/// # use strategy_ext::meta::ValueTreeStrategy; /// /// proptest! { /// #[test] diff --git a/tiledb/test-utils/src/strategy/records.rs b/test-utils/strategy-ext/src/records.rs similarity index 99% rename from tiledb/test-utils/src/strategy/records.rs rename to test-utils/strategy-ext/src/records.rs index 7d237a64..bea7594f 100644 --- a/tiledb/test-utils/src/strategy/records.rs +++ b/test-utils/strategy-ext/src/records.rs @@ -416,8 +416,8 @@ enum ShrinkStep { mod tests { use super::*; - use crate::strategy::meta::*; - use crate::strategy::StrategyExt; + use crate::meta::*; + use crate::StrategyExt; #[test] fn shrink_convergence_u64() { diff --git a/tiledb/test-utils/src/strategy/sequence.rs b/test-utils/strategy-ext/src/sequence.rs similarity index 96% rename from tiledb/test-utils/src/strategy/sequence.rs rename to test-utils/strategy-ext/src/sequence.rs index 99036384..96565afc 100644 --- a/tiledb/test-utils/src/strategy/sequence.rs +++ b/test-utils/strategy-ext/src/sequence.rs @@ -61,8 +61,8 @@ mod tests { use proptest::prelude::*; use super::*; - use crate::strategy::meta::{ShrinkAction, ShrinkSequenceStrategy}; - use crate::strategy::StrategyExt; + use crate::meta::{ShrinkAction, ShrinkSequenceStrategy}; + use crate::StrategyExt; proptest! 
{ /// Ensure that the sequence strategy always returns a subsequence diff --git a/test-utils/uri/Cargo.toml b/test-utils/uri/Cargo.toml new file mode 100644 index 00000000..86519d04 --- /dev/null +++ b/test-utils/uri/Cargo.toml @@ -0,0 +1,9 @@ +[package] +name = "uri" +edition.workspace = true +rust-version.workspace = true +version.workspace = true + +[dependencies] +anyhow = { workspace = true } +tempfile = { workspace = true } diff --git a/test-utils/uri/src/lib.rs b/test-utils/uri/src/lib.rs new file mode 100644 index 00000000..c34ab878 --- /dev/null +++ b/test-utils/uri/src/lib.rs @@ -0,0 +1,22 @@ +mod tempdir; + +pub use tempdir::TestDirectory; + +use anyhow::Result; + +pub trait TestArrayUri { + fn base_dir(&self) -> Result; + fn with_paths(&self, paths: &[&str]) -> Result; + fn close(self) -> Result<()>; + + fn with_path(&self, path: &str) -> Result { + self.with_paths(&[path]) + } +} + +pub fn get_uri_generator() -> Result { + // TODO: Eventually this will check an environment variable to decide + // whether we should return a TestDirectory or a new struct called something + // like TestRestServer to run our test suite against the cloud service. 
+ tempdir::TestDirectory::new() +} diff --git a/tiledb/test-utils/src/uri_generators.rs b/test-utils/uri/src/tempdir.rs similarity index 73% rename from tiledb/test-utils/src/uri_generators.rs rename to test-utils/uri/src/tempdir.rs index e5860fda..c919dbaa 100644 --- a/tiledb/test-utils/src/uri_generators.rs +++ b/test-utils/uri/src/tempdir.rs @@ -1,22 +1,7 @@ use anyhow::{anyhow, Result}; use tempfile::TempDir; -pub trait TestArrayUri { - fn base_dir(&self) -> Result; - fn with_paths(&self, paths: &[&str]) -> Result; - fn close(self) -> Result<()>; - - fn with_path(&self, path: &str) -> Result { - self.with_paths(&[path]) - } -} - -pub fn get_uri_generator() -> Result { - // TODO: Eventually this will check an environment variable to decide - // whether we should return a TestDirectory or a new struct called something - // like TestRestServer to run our test suite against the cloud service. - TestDirectory::new() -} +use super::TestArrayUri; pub struct TestDirectory { base_dir: TempDir, diff --git a/tiledb/api/Cargo.toml b/tiledb/api/Cargo.toml index 1d357577..a4cad8e0 100644 --- a/tiledb/api/Cargo.toml +++ b/tiledb/api/Cargo.toml @@ -1,31 +1,35 @@ [package] -name = "tiledb" +name = "tiledb-api" version = { workspace = true } edition = { workspace = true } [lib] -name = "tiledb" +name = "tiledb_api" path = "src/lib.rs" [dependencies] anyhow = { workspace = true } -arrow = { version = "52.0.0", features = ["prettyprint"], optional = true } -itertools = "0" -num-traits = { version = "0.2", optional = true } -paste = "1.0" +arrow = { workspace = true, optional = true } +cells = { workspace = true, features = ["proptest-strategies"], optional = true } +itertools = { workspace = true } +num-traits = { workspace = true, optional = true } +paste = { workspace = true } proptest = { workspace = true, optional = true } -serde = { version = "1.0.136", features = ["derive"] } -serde_json = { workspace = true } -thiserror = "1.0.58" -tiledb-proc-macro = { workspace = true } 
+serde = { workspace = true, optional = true } +serde_json = { workspace = true, optional = true } +thiserror = { workspace = true } +tiledb-common = { workspace = true } +tiledb-pod = { workspace = true, optional = true, features = ["serde"] } tiledb-sys = { workspace = true } -tiledb-test-utils = { workspace = true, optional = true } -tiledb-utils = { workspace = true, features = ["serde_json"] } [dev-dependencies] -num-traits = { version = "0.2" } +cells = { workspace = true, features = ["proptest-strategies"] } +num-traits = { workspace = true } proptest = { workspace = true } -tiledb-test-utils = { workspace = true } +tiledb-common = { workspace = true, features = ["option-subset"] } +tiledb-pod = { workspace = true, features = ["proptest-strategies", "option-subset", "serde"] } +tiledb-utils = { workspace = true } +uri = { workspace = true } [build-dependencies] pkg-config = { workspace = true } @@ -33,5 +37,15 @@ tiledb-utils = { workspace = true } [features] default = [] -proptest-strategies = ["dep:num-traits", "dep:proptest", "dep:tiledb-test-utils"] -arrow = ["dep:arrow"] +arrow = ["dep:arrow", "dep:serde", "dep:serde_json", "tiledb-common/arrow", "tiledb-common/serde"] +pod = ["dep:tiledb-pod"] +proptest-strategies = ["dep:cells", "dep:proptest"] +serde = ["dep:serde", "dep:serde_json", "dep:tiledb-pod"] + +[[example]] +name = "fragment_info" +required-features = ["serde"] + +[[example]] +name = "using_tiledb_stats" +required-features = ["serde"] diff --git a/tiledb/api/examples/aggregates.rs b/tiledb/api/examples/aggregates.rs index b920e258..fefd3090 100644 --- a/tiledb/api/examples/aggregates.rs +++ b/tiledb/api/examples/aggregates.rs @@ -1,4 +1,4 @@ -extern crate tiledb; +extern crate tiledb_api as tiledb; use crate::tiledb::query::read::AggregateQueryBuilder; use std::path::PathBuf; diff --git a/tiledb/api/examples/fragment_info.rs b/tiledb/api/examples/fragment_info.rs index 7c2a7737..25fc4af2 100644 --- a/tiledb/api/examples/fragment_info.rs +++ 
b/tiledb/api/examples/fragment_info.rs @@ -1,4 +1,4 @@ -extern crate tiledb; +extern crate tiledb_api as tiledb; use std::path::PathBuf; diff --git a/tiledb/api/examples/groups.rs b/tiledb/api/examples/groups.rs index ed0fdd94..ad676c93 100644 --- a/tiledb/api/examples/groups.rs +++ b/tiledb/api/examples/groups.rs @@ -1,4 +1,4 @@ -extern crate tiledb; +extern crate tiledb_api as tiledb; use std::path::PathBuf; diff --git a/tiledb/api/examples/multi_range_subarray.rs b/tiledb/api/examples/multi_range_subarray.rs index 574740b9..d3fead2b 100644 --- a/tiledb/api/examples/multi_range_subarray.rs +++ b/tiledb/api/examples/multi_range_subarray.rs @@ -1,19 +1,19 @@ -extern crate tiledb; +extern crate tiledb_api as tiledb; use std::path::PathBuf; use itertools::izip; use tiledb::array::{ - Array, ArrayType, AttributeData, CellOrder, DimensionData, DomainData, - SchemaData, TileOrder, + Array, ArrayType, AttributeBuilder, CellOrder, DimensionBuilder, + DomainBuilder, SchemaBuilder, TileOrder, }; use tiledb::context::Context; use tiledb::query::{ Query, QueryBuilder, ReadBuilder, ReadQuery, ReadQueryBuilder, WriteBuilder, }; +use tiledb::Datatype; use tiledb::Result as TileDBResult; -use tiledb::{Datatype, Factory}; const ARRAY_URI: &str = "multi_range_slicing"; @@ -78,36 +78,37 @@ fn main() -> TileDBResult<()> { } fn create_array(ctx: &Context) -> TileDBResult<()> { - let schema = SchemaData { - array_type: ArrayType::Dense, - domain: DomainData { - dimension: vec![ - DimensionData { - name: "rows".to_owned(), - datatype: Datatype::Int32, - constraints: ([1i32, 4], 4i32).into(), - filters: None, - }, - DimensionData { - name: "cols".to_owned(), - datatype: Datatype::Int32, - constraints: ([1i32, 4], 4i32).into(), - filters: None, - }, - ], - }, - attributes: vec![AttributeData { - name: "a".to_owned(), - datatype: Datatype::Int32, - ..Default::default() - }], - tile_order: Some(TileOrder::RowMajor), - cell_order: Some(CellOrder::RowMajor), - - ..Default::default() + let 
schema = { + let domain = DomainBuilder::new(ctx)? + .add_dimension( + DimensionBuilder::new( + ctx, + "rows", + Datatype::Int32, + ([1i32, 4], 4i32), + )? + .build(), + )? + .add_dimension( + DimensionBuilder::new( + ctx, + "cols", + Datatype::Int32, + ([1i32, 4], 4i32), + )? + .build(), + )? + .build(); + + SchemaBuilder::new(ctx, ArrayType::Dense, domain)? + .cell_order(CellOrder::RowMajor)? + .tile_order(TileOrder::RowMajor)? + .add_attribute( + AttributeBuilder::new(ctx, "a", Datatype::Int32)?.build(), + )? + .build()? }; - let schema = schema.create(ctx)?; Array::create(ctx, ARRAY_URI, schema)?; Ok(()) } diff --git a/tiledb/api/examples/query_condition_dense.rs b/tiledb/api/examples/query_condition_dense.rs index d37c3c24..8afbae71 100644 --- a/tiledb/api/examples/query_condition_dense.rs +++ b/tiledb/api/examples/query_condition_dense.rs @@ -1,3 +1,5 @@ +extern crate tiledb_api as tiledb; + use std::cell::RefCell; use std::path::PathBuf; diff --git a/tiledb/api/examples/query_condition_sparse.rs b/tiledb/api/examples/query_condition_sparse.rs index d656bcf5..6d300c36 100644 --- a/tiledb/api/examples/query_condition_sparse.rs +++ b/tiledb/api/examples/query_condition_sparse.rs @@ -1,3 +1,5 @@ +extern crate tiledb_api as tiledb; + use std::cell::RefCell; use std::path::PathBuf; diff --git a/tiledb/api/examples/quickstart_dense.rs b/tiledb/api/examples/quickstart_dense.rs index 719557af..f388fe28 100644 --- a/tiledb/api/examples/quickstart_dense.rs +++ b/tiledb/api/examples/quickstart_dense.rs @@ -1,4 +1,4 @@ -extern crate tiledb; +extern crate tiledb_api as tiledb; use std::path::PathBuf; diff --git a/tiledb/api/examples/quickstart_sparse_string.rs b/tiledb/api/examples/quickstart_sparse_string.rs index 3498e566..7b5d6b26 100644 --- a/tiledb/api/examples/quickstart_sparse_string.rs +++ b/tiledb/api/examples/quickstart_sparse_string.rs @@ -1,4 +1,4 @@ -extern crate tiledb; +extern crate tiledb_api as tiledb; use std::path::PathBuf; @@ -6,15 +6,15 @@ use 
itertools::izip; use tiledb::array::dimension::DimensionConstraints; use tiledb::array::{ - Array, ArrayType, AttributeData, CellOrder, DimensionData, DomainData, - SchemaData, TileOrder, + Array, ArrayType, AttributeBuilder, CellOrder, DimensionBuilder, + DomainBuilder, SchemaBuilder, TileOrder, }; use tiledb::context::Context; use tiledb::query::{ Query, QueryBuilder, ReadBuilder, ReadQuery, ReadQueryBuilder, WriteBuilder, }; +use tiledb::Datatype; use tiledb::Result as TileDBResult; -use tiledb::{Datatype, Factory}; const ARRAY_URI: &str = "quickstart_sparse_string"; @@ -54,36 +54,37 @@ fn main() -> TileDBResult<()> { } fn create_array(ctx: &Context) -> TileDBResult<()> { - let schema = SchemaData { - array_type: ArrayType::Sparse, - domain: DomainData { - dimension: vec![ - DimensionData { - name: "rows".to_owned(), - datatype: Datatype::StringAscii, - constraints: DimensionConstraints::StringAscii, - filters: None, - }, - DimensionData { - name: "cols".to_owned(), - datatype: Datatype::Int32, - constraints: ([1i32, 4], 4i32).into(), - filters: None, - }, - ], - }, - attributes: vec![AttributeData { - name: "a".to_owned(), - datatype: Datatype::Int32, - ..Default::default() - }], - tile_order: Some(TileOrder::RowMajor), - cell_order: Some(CellOrder::RowMajor), - - ..Default::default() + let schema = { + let domain = DomainBuilder::new(ctx)? + .add_dimension( + DimensionBuilder::new( + ctx, + "rows", + Datatype::StringAscii, + DimensionConstraints::StringAscii, + )? + .build(), + )? + .add_dimension( + DimensionBuilder::new( + ctx, + "cols", + Datatype::Int32, + ([1i32, 4], 4i32), + )? + .build(), + )? + .build(); + + SchemaBuilder::new(ctx, ArrayType::Sparse, domain)? + .cell_order(CellOrder::RowMajor)? + .tile_order(TileOrder::RowMajor)? + .add_attribute( + AttributeBuilder::new(ctx, "a", Datatype::Int32)?.build(), + )? + .build()? 
}; - let schema = schema.create(ctx)?; Array::create(ctx, ARRAY_URI, schema)?; Ok(()) } diff --git a/tiledb/api/examples/reading_incomplete.rs b/tiledb/api/examples/reading_incomplete.rs index 1cd035dd..9d7d19ed 100644 --- a/tiledb/api/examples/reading_incomplete.rs +++ b/tiledb/api/examples/reading_incomplete.rs @@ -1,10 +1,10 @@ -extern crate tiledb; +extern crate tiledb_api as tiledb; use std::cell::{Ref, RefCell}; use std::path::PathBuf; use itertools::izip; -use tiledb::array::{CellOrder, TileOrder}; +use tiledb::array::{CellOrder, Mode, TileOrder}; use tiledb::query::buffer::{ BufferMut, CellStructureMut, QueryBuffers, QueryBuffersMut, }; @@ -103,8 +103,7 @@ fn create_array() -> TileDBResult<()> { fn write_array() -> TileDBResult<()> { let tdb = tiledb::Context::new()?; - let array = - tiledb::Array::open(&tdb, ARRAY_NAME, tiledb::array::Mode::Write)?; + let array = tiledb::Array::open(&tdb, ARRAY_NAME, Mode::Write)?; let coords_rows = vec![1, 2, 2]; let coords_cols = vec![1, 1, 2]; @@ -128,8 +127,7 @@ fn write_array() -> TileDBResult<()> { /// from a query. The example wants to print out the query result set. /// Below are several different ways to implement this functionality. fn query_builder_start(tdb: &tiledb::Context) -> TileDBResult { - let array = - tiledb::Array::open(tdb, ARRAY_NAME, tiledb::array::Mode::Read)?; + let array = tiledb::Array::open(tdb, ARRAY_NAME, Mode::Read)?; tiledb::query::ReadBuilder::new(array)? .layout(tiledb::query::QueryLayout::RowMajor)? 
diff --git a/tiledb/api/examples/using_tiledb_stats.rs b/tiledb/api/examples/using_tiledb_stats.rs index a3919fc6..9c427845 100644 --- a/tiledb/api/examples/using_tiledb_stats.rs +++ b/tiledb/api/examples/using_tiledb_stats.rs @@ -1,4 +1,4 @@ -extern crate tiledb; +extern crate tiledb_api as tiledb; use std::path::PathBuf; diff --git a/tiledb/api/src/array/attribute/arrow.rs b/tiledb/api/src/array/attribute/arrow.rs index e88bfe2b..2f744614 100644 --- a/tiledb/api/src/array/attribute/arrow.rs +++ b/tiledb/api/src/array/attribute/arrow.rs @@ -3,17 +3,22 @@ use std::collections::HashMap; use anyhow::anyhow; use serde::{Deserialize, Serialize}; use serde_json::json; +use tiledb_common::array::CellValNum; +use tiledb_common::datatype::arrow::{ + DatatypeFromArrowResult, DatatypeToArrowResult, +}; +use tiledb_common::datatype::Datatype; +use tiledb_common::physical_type_go; use crate::array::schema::arrow::{ AttributeFromArrowResult, FieldToArrowResult, }; -use crate::array::{Attribute, AttributeBuilder, CellValNum}; +use crate::array::{Attribute, AttributeBuilder}; use crate::context::ContextBound; -use crate::datatype::arrow::{DatatypeFromArrowResult, DatatypeToArrowResult}; use crate::error::Error; use crate::filter::arrow::FilterMetadata; use crate::filter::FilterListBuilder; -use crate::{physical_type_go, Context, Datatype, Result as TileDBResult}; +use crate::{Context, Result as TileDBResult}; // additional methods with arrow features impl Attribute { @@ -105,7 +110,7 @@ pub fn to_arrow(attr: &Attribute) -> TileDBResult { )]))) }; - let arrow_dt = crate::datatype::arrow::to_arrow( + let arrow_dt = tiledb_common::datatype::arrow::to_arrow( &attr.datatype()?, attr.cell_val_num()?, ); @@ -156,7 +161,7 @@ pub fn from_arrow( } }; - match crate::datatype::arrow::from_arrow(field.data_type()) { + match tiledb_common::datatype::arrow::from_arrow(field.data_type()) { DatatypeFromArrowResult::None => Ok(AttributeFromArrowResult::None), 
DatatypeFromArrowResult::Inexact(datatype, cell_val_num) => { Ok(AttributeFromArrowResult::Inexact(construct( @@ -179,8 +184,10 @@ pub mod strategy { pub fn prop_arrow_field() -> impl Strategy { ( - crate::array::attribute::strategy::prop_attribute_name(), - crate::datatype::arrow::strategy::any_datatype(Default::default()), + tiledb_pod::array::attribute::strategy::prop_attribute_name(), + tiledb_common::datatype::arrow::strategy::any_datatype( + Default::default(), + ), any::(), Just(HashMap::::new()), /* TODO: we'd like to check that metadata is preserved, * but right now the CAPI doesn't appear to have a way @@ -196,10 +203,11 @@ pub mod strategy { #[cfg(test)] pub mod tests { + use proptest::prelude::*; + use tiledb_pod::array::attribute::AttributeData; + use super::*; - use crate::array::attribute::AttributeData; use crate::Factory; - use proptest::prelude::*; fn do_tiledb_arrow(tdb_spec: AttributeData) { let c: Context = Context::new().unwrap(); @@ -274,9 +282,9 @@ pub mod tests { assert_eq!(arrow_in.is_nullable(), arrow_out.is_nullable()); /* break out some datatypes */ - use crate::datatype::arrow::tests::arrow_datatype_is_inexact_compatible; + use tiledb_common::datatype::arrow::is_physical_type_match; assert!( - arrow_datatype_is_inexact_compatible( + is_physical_type_match( arrow_in.data_type(), arrow_out.data_type() ), @@ -289,7 +297,7 @@ pub mod tests { proptest! 
{ #[test] - fn test_tiledb_arrow(tdb_in in crate::array::attribute::strategy::prop_attribute(Default::default())) { + fn test_tiledb_arrow(tdb_in in tiledb_pod::array::attribute::strategy::prop_attribute(Default::default())) { do_tiledb_arrow(tdb_in); } diff --git a/tiledb/api/src/array/attribute/mod.rs b/tiledb/api/src/array/attribute/mod.rs index 4c77f441..c7c7fc78 100644 --- a/tiledb/api/src/array/attribute/mod.rs +++ b/tiledb/api/src/array/attribute/mod.rs @@ -1,24 +1,21 @@ extern crate tiledb_sys as ffi; use std::borrow::Borrow; -use std::fmt::{Debug, Display, Formatter, Result as FmtResult}; -use std::num::NonZeroU32; use std::ops::Deref; -use anyhow::anyhow; -use serde::{Deserialize, Serialize}; -use serde_json::json; -use util::option::OptionSubset; +#[cfg(any(test, feature = "pod"))] +use std::fmt::{Debug, Formatter, Result as FmtResult}; + +use tiledb_common::array::attribute::{FromFillValue, IntoFillValue}; use crate::array::CellValNum; use crate::context::{CApiInterface, Context, ContextBound}; use crate::datatype::physical::BitsEq; -use crate::datatype::PhysicalType; -use crate::error::{DatatypeErrorKind, Error}; -use crate::filter::list::{FilterList, FilterListData, RawFilterList}; +use crate::error::{DatatypeError, Error}; +use crate::filter::list::{FilterList, RawFilterList}; use crate::physical_type_go; use crate::string::{RawTDBString, TDBString}; -use crate::{Datatype, Factory, Result as TileDBResult}; +use crate::{Datatype, Result as TileDBResult}; pub(crate) enum RawAttribute { Owned(*mut ffi::tiledb_attribute_t), @@ -74,11 +71,11 @@ impl Attribute { } pub fn datatype(&self) -> TileDBResult { - let mut c_dtype: std::ffi::c_uint = 0; + let mut c_dtype: ffi::tiledb_datatype_t = out_ptr!(); self.capi_call(|ctx| unsafe { ffi::tiledb_attribute_get_type(ctx, *self.raw, &mut c_dtype) })?; - Datatype::try_from(c_dtype) + Ok(Datatype::try_from(c_dtype)?) 
} pub fn is_nullable(&self) -> TileDBResult { @@ -102,11 +99,11 @@ impl Attribute { } pub fn cell_val_num(&self) -> TileDBResult { - let mut c_num: std::ffi::c_uint = 0; + let mut c_num: std::ffi::c_uint = out_ptr!(); self.capi_call(|ctx| unsafe { ffi::tiledb_attribute_get_cell_val_num(ctx, *self.raw, &mut c_num) })?; - CellValNum::try_from(c_num) + Ok(CellValNum::try_from(c_num)?) } pub fn is_var_sized(&self) -> TileDBResult { @@ -127,10 +124,11 @@ impl Attribute { let mut c_size: u64 = 0; if !self.datatype()?.is_compatible_type::() { - return Err(Error::Datatype(DatatypeErrorKind::TypeMismatch { - user_type: std::any::type_name::().to_owned(), - tiledb_type: self.datatype()?, - })); + return Err(Error::Datatype( + DatatypeError::physical_type_incompatible::( + self.datatype()?, + ), + )); } self.capi_call(|ctx| unsafe { @@ -151,17 +149,18 @@ impl Attribute { let slice: &[F::PhysicalType] = unsafe { std::slice::from_raw_parts(c_ptr as *const F::PhysicalType, len) }; - F::from_raw(slice) + Ok(F::from_raw(slice)?) } pub fn fill_value_nullable<'a, F: FromFillValue<'a>>( &'a self, ) -> TileDBResult<(F, bool)> { if !self.datatype()?.is_compatible_type::() { - return Err(Error::Datatype(DatatypeErrorKind::TypeMismatch { - user_type: std::any::type_name::().to_owned(), - tiledb_type: self.datatype()?, - })); + return Err(Error::Datatype( + DatatypeError::physical_type_incompatible::( + self.datatype()?, + ), + )); } if !self.is_nullable()? 
{ /* see comment in Builder::fill_value_nullability */ @@ -217,17 +216,6 @@ impl Attribute { } } -impl Debug for Attribute { - fn fmt(&self, f: &mut Formatter) -> FmtResult { - let data = - AttributeData::try_from(self).map_err(|_| std::fmt::Error)?; - let mut json = json!(data); - json["raw"] = json!(format!("{:p}", *self.raw)); - - write!(f, "{}", json) - } -} - impl PartialEq for Attribute { fn eq(&self, other: &Attribute) -> bool { let names_match = match (self.name(), other.name()) { @@ -303,6 +291,19 @@ impl PartialEq for Attribute { } } +#[cfg(any(test, feature = "pod"))] +impl Debug for Attribute { + fn fmt(&self, f: &mut Formatter) -> FmtResult { + match tiledb_pod::array::attribute::AttributeData::try_from(self) { + Ok(a) => Debug::fmt(&a, f), + Err(e) => { + let RawAttribute::Owned(ptr) = self.raw; + write!(f, "", ptr, e) + } + } + } +} + pub struct Builder { attr: Attribute, } @@ -342,7 +343,7 @@ impl Builder { } pub fn cell_val_num(self, num: CellValNum) -> TileDBResult { - let c_num = num.capi() as std::ffi::c_uint; + let c_num = std::ffi::c_uint::from(num); self.capi_call(|ctx| unsafe { ffi::tiledb_attribute_set_cell_val_num(ctx, *self.attr.raw, c_num) })?; @@ -390,10 +391,11 @@ impl Builder { .datatype()? .is_compatible_type::() { - return Err(Error::Datatype(DatatypeErrorKind::TypeMismatch { - user_type: std::any::type_name::().to_owned(), - tiledb_type: self.attr.datatype()?, - })); + return Err(Error::Datatype( + DatatypeError::physical_type_incompatible::( + self.datatype()?, + ), + )); } let fill: &[F::PhysicalType] = value.to_raw(); @@ -430,10 +432,11 @@ impl Builder { .datatype()? 
.is_compatible_type::() { - return Err(Error::Datatype(DatatypeErrorKind::TypeMismatch { - user_type: std::any::type_name::().to_owned(), - tiledb_type: self.attr.datatype()?, - })); + return Err(Error::Datatype( + DatatypeError::physical_type_incompatible::( + self.attr.datatype()?, + ), + )); } let fill: &[F::PhysicalType] = value.to_raw(); @@ -473,295 +476,20 @@ impl Builder { } } -/// Trait for data which can be used as a fill value for an attribute. -pub trait IntoFillValue { - type PhysicalType: PhysicalType; - - /// Get a reference to the raw fill value data. - /// The returned slice will be copied into the tiledb core. - fn to_raw(&self) -> &[Self::PhysicalType]; -} - -/// Trait for data which can be constructed from an attribute's raw fill value. -pub trait FromFillValue<'a>: IntoFillValue + Sized { - /// Construct a value of this type from a raw fill value. - fn from_raw(raw: &'a [Self::PhysicalType]) -> TileDBResult; -} - -impl IntoFillValue for T -where - T: PhysicalType, -{ - type PhysicalType = Self; - - fn to_raw(&self) -> &[Self::PhysicalType] { - std::slice::from_ref(self) - } -} - -impl FromFillValue<'_> for T -where - T: PhysicalType, -{ - fn from_raw(raw: &[Self::PhysicalType]) -> TileDBResult { - if raw.len() == 1 { - Ok(raw[0]) - } else { - Err(Error::Datatype(DatatypeErrorKind::UnexpectedCellStructure { - context: None, - found: CellValNum::try_from(raw.len() as u32) - /* this should be safe because core forbids zero-length fill value */ - .unwrap(), - expected: CellValNum::single() - })) - } - } -} - -impl IntoFillValue for [T; K] -where - T: PhysicalType, -{ - type PhysicalType = T; - - fn to_raw(&self) -> &[Self::PhysicalType] { - self - } -} - -impl<'a, T, const K: usize> FromFillValue<'a> for [T; K] -where - T: PhysicalType, -{ - fn from_raw(raw: &'a [Self::PhysicalType]) -> TileDBResult { - Self::try_from(raw).map_err(|_| - Error::Datatype(DatatypeErrorKind::UnexpectedCellStructure { - context: None, - found: 
CellValNum::try_from(raw.len() as u32) - /* this should be safe because core forbids zero-length fill value */ - .unwrap(), - expected: { - /* unfortunately no clear way to bound `0 < K < u32::MAX` for a trait impl */ - let nz = u32::try_from(K).ok().and_then(NonZeroU32::new) - .expect("`impl FillValue for [T; K] requires 0 < K < u32::MAX"); - CellValNum::Fixed(nz) - } - })) - } -} - -impl IntoFillValue for &[T] -where - T: PhysicalType, -{ - type PhysicalType = T; - - fn to_raw(&self) -> &[Self::PhysicalType] { - self - } -} - -impl<'a, T> FromFillValue<'a> for &'a [T] -where - T: PhysicalType, -{ - fn from_raw(raw: &'a [Self::PhysicalType]) -> TileDBResult { - Ok(raw) - } -} - -impl IntoFillValue for Vec -where - T: PhysicalType, -{ - type PhysicalType = T; - - fn to_raw(&self) -> &[Self::PhysicalType] { - self.as_slice() - } -} - -impl FromFillValue<'_> for Vec -where - T: PhysicalType, -{ - fn from_raw(raw: &[Self::PhysicalType]) -> TileDBResult { - Ok(raw.to_vec()) - } -} - -impl IntoFillValue for &str { - type PhysicalType = u8; - - fn to_raw(&self) -> &[Self::PhysicalType] { - self.as_bytes() - } -} - -impl<'a> FromFillValue<'a> for &'a str { - fn from_raw(raw: &'a [Self::PhysicalType]) -> TileDBResult { - std::str::from_utf8(raw).map_err(|e| { - Error::Deserialization( - "Non-UTF8 fill value".to_string(), - anyhow!(e), - ) - }) - } -} - -impl IntoFillValue for String { - type PhysicalType = u8; - - fn to_raw(&self) -> &[Self::PhysicalType] { - self.as_bytes() - } -} - -impl<'a> FromFillValue<'a> for String { - fn from_raw(raw: &'a [Self::PhysicalType]) -> TileDBResult { - <&'a str as FromFillValue<'a>>::from_raw(raw).map(|s| s.to_string()) - } -} - -/// Encapsulation of data needed to construct an Attribute's fill value -#[derive(Clone, Debug, Deserialize, OptionSubset, PartialEq, Serialize)] -pub struct FillData { - pub data: crate::metadata::Value, - pub nullability: Option, -} - -/// Encapsulation of data needed to construct an Attribute -#[derive( - 
Clone, Default, Debug, Deserialize, OptionSubset, Serialize, PartialEq, -)] -pub struct AttributeData { - pub name: String, - pub datatype: Datatype, - pub nullability: Option, - pub cell_val_num: Option, - pub fill: Option, - pub filters: FilterListData, -} - -#[cfg(any(test, feature = "proptest-strategies"))] -impl AttributeData { - /// Returns a strategy for generating values of this attribute's type. - pub fn value_strategy(&self) -> crate::query::strategy::FieldValueStrategy { - use crate::query::strategy::FieldValueStrategy; - use proptest::prelude::*; - - use crate::filter::{CompressionData, CompressionType, FilterData}; - let has_double_delta = self.filters.iter().any(|f| { - matches!( - f, - FilterData::Compression(CompressionData { - kind: CompressionType::DoubleDelta { .. }, - .. - }) - ) - }); - - physical_type_go!(self.datatype, DT, { - if has_double_delta { - if std::any::TypeId::of::
() == std::any::TypeId::of::() - { - // see core `DoubleDelta::compute_bitsize` - let min = 0u64; - let max = u64::MAX >> 1; - return FieldValueStrategy::from((min..=max).boxed()); - } else if std::any::TypeId::of::
() - == std::any::TypeId::of::() - { - let min = i64::MIN >> 2; - let max = i64::MAX >> 2; - return FieldValueStrategy::from((min..=max).boxed()); - } - } - FieldValueStrategy::from(any::
().boxed()) - }) - } -} - -impl Display for AttributeData { - fn fmt(&self, f: &mut Formatter) -> FmtResult { - write!(f, "{}", json!(*self)) - } -} - -impl TryFrom<&Attribute> for AttributeData { - type Error = crate::error::Error; - - fn try_from(attr: &Attribute) -> TileDBResult { - let datatype = attr.datatype()?; - let fill = physical_type_go!(datatype, DT, { - let (fill_value, fill_value_nullability) = - attr.fill_value_nullable::<&[DT]>()?; - FillData { - data: fill_value.to_vec().into(), - nullability: Some(fill_value_nullability), - } - }); - - Ok(AttributeData { - name: attr.name()?, - datatype, - nullability: Some(attr.is_nullable()?), - cell_val_num: Some(attr.cell_val_num()?), - fill: Some(fill), - filters: FilterListData::try_from(&attr.filter_list()?)?, - }) - } -} - -impl TryFrom for AttributeData { - type Error = crate::error::Error; - - fn try_from(attr: Attribute) -> TileDBResult { - Self::try_from(&attr) - } -} - -impl Factory for AttributeData { - type Item = Attribute; - - fn create(&self, context: &Context) -> TileDBResult { - let mut b = Builder::new(context, &self.name, self.datatype)? 
- .filter_list(self.filters.create(context)?)?; - - if let Some(n) = self.nullability { - b = b.nullability(n)?; - } - if let Some(c) = self.cell_val_num { - if !matches!((self.datatype, c), (Datatype::Any, CellValNum::Var)) { - /* SC-46696 */ - b = b.cell_val_num(c)?; - } - } - if let Some(ref fill) = self.fill { - b = crate::metadata::value_go!(fill.data, _DT, ref value, { - if let Some(fill_nullability) = fill.nullability { - b.fill_value_nullability(value.as_slice(), fill_nullability) - } else { - b.fill_value(value.as_slice()) - } - })?; - } - - Ok(b.build()) - } -} - #[cfg(feature = "arrow")] pub mod arrow; -#[cfg(any(test, feature = "proptest-strategies"))] -pub mod strategy; +#[cfg(any(test, feature = "pod"))] +pub mod pod; #[cfg(test)] -mod test { +mod tests { + use tiledb_pod::array::attribute::AttributeData; + use super::*; use crate::filter::list::Builder as FilterListBuilder; use crate::filter::*; + use crate::Factory; /// Test what the default values filled in for `None` with attribute data are. 
/// Mostly because if we write code which does need the default, we're expecting diff --git a/tiledb/api/src/array/attribute/pod.rs b/tiledb/api/src/array/attribute/pod.rs new file mode 100644 index 00000000..eedf54e6 --- /dev/null +++ b/tiledb/api/src/array/attribute/pod.rs @@ -0,0 +1,105 @@ +use tiledb_common::array::CellValNum; +use tiledb_common::datatype::Datatype; +use tiledb_common::filter::FilterData; +use tiledb_common::{metadata_value_go, physical_type_go}; +use tiledb_pod::array::attribute::{AttributeData, FillData}; + +use super::{Attribute, Builder}; +use crate::error::Error as TileDBError; +use crate::{Context, Factory, Result as TileDBResult}; + +impl TryFrom<&Attribute> for AttributeData { + type Error = TileDBError; + + fn try_from(attr: &Attribute) -> Result { + let datatype = attr.datatype()?; + let fill = physical_type_go!(datatype, DT, { + let (fill_value, fill_value_nullability) = + attr.fill_value_nullable::<&[DT]>()?; + FillData { + data: fill_value.to_vec().into(), + nullability: Some(fill_value_nullability), + } + }); + + Ok(AttributeData { + name: attr.name()?, + datatype, + nullability: Some(attr.is_nullable()?), + cell_val_num: Some(attr.cell_val_num()?), + fill: Some(fill), + filters: Vec::::try_from(&attr.filter_list()?)?, + }) + } +} + +impl TryFrom for AttributeData { + type Error = TileDBError; + + fn try_from(attribute: Attribute) -> Result { + Self::try_from(&attribute) + } +} + +impl Factory for AttributeData { + type Item = Attribute; + + fn create(&self, context: &Context) -> TileDBResult { + let mut b = Builder::new(context, &self.name, self.datatype)? 
+ .filter_list(self.filters.create(context)?)?; + + if let Some(n) = self.nullability { + b = b.nullability(n)?; + } + if let Some(c) = self.cell_val_num { + if !matches!((self.datatype, c), (Datatype::Any, CellValNum::Var)) { + /* SC-46696 */ + b = b.cell_val_num(c)?; + } + } + if let Some(ref fill) = self.fill { + b = metadata_value_go!(fill.data, _DT, ref value, { + if let Some(fill_nullability) = fill.nullability { + b.fill_value_nullability(value.as_slice(), fill_nullability) + } else { + b.fill_value(value.as_slice()) + } + })?; + } + + Ok(b.build()) + } +} + +#[cfg(test)] +mod tests { + use proptest::prelude::*; + use utils::assert_option_subset; + + use super::*; + use crate::{Context, Factory}; + + /// Test that the arbitrary attribute construction always succeeds + #[test] + fn attribute_arbitrary() { + let ctx = Context::new().expect("Error creating context"); + + proptest!(|(attr in any::())| { + attr.create(&ctx).expect("Error constructing arbitrary attribute"); + }); + } + + #[test] + fn attribute_eq_reflexivity() { + let ctx = Context::new().expect("Error creating context"); + + proptest!(|(attr in any::())| { + assert_eq!(attr, attr); + assert_option_subset!(attr, attr); + + let attr = attr.create(&ctx) + .expect("Error constructing arbitrary attribute"); + assert_eq!(attr, attr); + }); + } +} diff --git a/tiledb/api/src/array/dimension/arrow.rs b/tiledb/api/src/array/dimension/arrow.rs index bf80ad3c..0ba6456a 100644 --- a/tiledb/api/src/array/dimension/arrow.rs +++ b/tiledb/api/src/array/dimension/arrow.rs @@ -3,6 +3,10 @@ use std::collections::HashMap; use anyhow::anyhow; use serde::{Deserialize, Serialize}; use serde_json::json; +use tiledb_common::datatype::arrow::{ + DatatypeFromArrowResult, DatatypeToArrowResult, +}; +use tiledb_common::physical_type_go; use crate::array::dimension::DimensionConstraints; use crate::array::schema::arrow::{ @@ -10,12 +14,9 @@ use crate::array::schema::arrow::{ }; use crate::array::{Dimension, DimensionBuilder}; 
use crate::context::{Context as TileDBContext, ContextBound}; -use crate::datatype::arrow::{DatatypeFromArrowResult, DatatypeToArrowResult}; use crate::filter::arrow::FilterMetadata; use crate::filter::FilterListBuilder; -use crate::{ - error::Error as TileDBError, physical_type_go, Result as TileDBResult, -}; +use crate::{error::Error as TileDBError, Result as TileDBResult}; // additional methods with arrow features impl Dimension { @@ -59,8 +60,10 @@ impl DimensionMetadata { /// Details about the Dimension are stored under the key "tiledb" /// in the Field's metadata. pub fn to_arrow(dim: &Dimension) -> TileDBResult { - let arrow_dt = - crate::datatype::arrow::to_arrow(&dim.datatype()?, dim.cell_val_num()?); + let arrow_dt = tiledb_common::datatype::arrow::to_arrow( + &dim.datatype()?, + dim.cell_val_num()?, + ); let construct = |adt| -> TileDBResult { let name = dim.name()?; @@ -168,7 +171,7 @@ pub fn from_arrow( dim.cell_val_num(cell_val_num)?.filters(fl) }; - match crate::datatype::arrow::from_arrow(field.data_type()) { + match tiledb_common::datatype::arrow::from_arrow(field.data_type()) { DatatypeFromArrowResult::None => Ok(DimensionFromArrowResult::None), DatatypeFromArrowResult::Inexact(datatype, cell_val_num) => { Ok(DimensionFromArrowResult::Inexact(construct( @@ -184,10 +187,11 @@ pub fn from_arrow( #[cfg(test)] mod tests { + use proptest::prelude::*; + use tiledb_pod::array::dimension::DimensionData; + use super::*; - use crate::array::dimension::DimensionData; use crate::{Datatype, Factory}; - use proptest::prelude::*; fn do_to_arrow(tdb_in: DimensionData) { let c: TileDBContext = TileDBContext::new().unwrap(); diff --git a/tiledb/api/src/array/dimension/mod.rs b/tiledb/api/src/array/dimension/mod.rs index 58fbca0e..14230fd2 100644 --- a/tiledb/api/src/array/dimension/mod.rs +++ b/tiledb/api/src/array/dimension/mod.rs @@ -1,17 +1,16 @@ -use std::fmt::{Debug, Display, Formatter, Result as FmtResult}; use std::ops::Deref; -use serde::{Deserialize, 
Serialize}; -use serde_json::json; -use util::option::OptionSubset; +#[cfg(any(test, feature = "pod"))] +use std::fmt::{Debug, Formatter, Result as FmtResult}; use crate::array::CellValNum; use crate::context::{CApiInterface, Context, ContextBound}; use crate::datatype::PhysicalType; -use crate::error::{DatatypeErrorKind, Error}; -use crate::filter::list::{FilterList, FilterListData, RawFilterList}; -use crate::range::SingleValueRange; -use crate::{physical_type_go, Datatype, Factory, Result as TileDBResult}; +use crate::filter::list::{FilterList, RawFilterList}; +use crate::{physical_type_go, Datatype, Result as TileDBResult}; + +pub use tiledb_common::array::dimension::DimensionConstraints; +pub use tiledb_common::dimension_constraints_go; pub(crate) enum RawDimension { Owned(*mut ffi::tiledb_dimension_t), @@ -73,7 +72,7 @@ impl Dimension { ffi::tiledb_dimension_get_type(ctx, c_dimension, &mut c_datatype) })?; - Datatype::try_from(c_datatype) + Ok(Datatype::try_from(c_datatype)?) } pub fn cell_val_num(&self) -> TileDBResult { @@ -81,7 +80,7 @@ impl Dimension { self.capi_call(|ctx| unsafe { ffi::tiledb_dimension_get_cell_val_num(ctx, *self.raw, &mut c_num) })?; - CellValNum::try_from(c_num) + Ok(CellValNum::try_from(c_num)?) 
} pub fn is_var_sized(&self) -> TileDBResult { @@ -143,17 +142,6 @@ impl Dimension { } } -impl Debug for Dimension { - fn fmt(&self, f: &mut Formatter) -> FmtResult { - let data = - DimensionData::try_from(self).map_err(|_| std::fmt::Error)?; - let mut json = json!(data); - json["raw"] = json!(format!("{:p}", *self.raw)); - - write!(f, "{}", json) - } -} - impl PartialEq for Dimension { fn eq(&self, other: &Dimension) -> bool { eq_helper!(self.name(), other.name()); @@ -170,275 +158,16 @@ impl PartialEq for Dimension { } } -#[derive(Clone, Debug, Deserialize, OptionSubset, PartialEq, Serialize)] -pub enum DimensionConstraints { - Int8([i8; 2], Option), - Int16([i16; 2], Option), - Int32([i32; 2], Option), - Int64([i64; 2], Option), - UInt8([u8; 2], Option), - UInt16([u16; 2], Option), - UInt32([u32; 2], Option), - UInt64([u64; 2], Option), - Float32([f32; 2], Option), - Float64([f64; 2], Option), - StringAscii, -} - -#[macro_export] -macro_rules! dimension_constraints_go { - ($expr:expr, $DT:ident, $range:pat, $extent:pat, $then:expr, $string:expr) => {{ - dimension_constraints_go!( - $expr, $DT, $range, $extent, $then, $then, $string - ) - }}; - ($expr:expr, $DT:ident, $range:pat, $extent:pat, $integral:expr, $float:expr, $string:expr) => {{ - use $crate::array::dimension::DimensionConstraints; - match $expr { - #[allow(unused_variables)] - DimensionConstraints::Int8($range, $extent) => { - #[allow(dead_code)] - type $DT = i8; - $integral - } - #[allow(unused_variables)] - DimensionConstraints::Int16($range, $extent) => { - #[allow(dead_code)] - type $DT = i16; - $integral - } - #[allow(unused_variables)] - DimensionConstraints::Int32($range, $extent) => { - #[allow(dead_code)] - type $DT = i32; - $integral - } - #[allow(unused_variables)] - DimensionConstraints::Int64($range, $extent) => { - #[allow(dead_code)] - type $DT = i64; - $integral - } - #[allow(unused_variables)] - DimensionConstraints::UInt8($range, $extent) => { - #[allow(dead_code)] - type $DT = u8; 
- $integral - } - #[allow(unused_variables)] - DimensionConstraints::UInt16($range, $extent) => { - #[allow(dead_code)] - type $DT = u16; - $integral - } - #[allow(unused_variables)] - DimensionConstraints::UInt32($range, $extent) => { - #[allow(dead_code)] - type $DT = u32; - $integral - } - #[allow(unused_variables)] - DimensionConstraints::UInt64($range, $extent) => { - #[allow(dead_code)] - type $DT = u64; - $integral - } - #[allow(unused_variables)] - DimensionConstraints::Float32($range, $extent) => { - #[allow(dead_code)] - type $DT = f32; - $float - } - #[allow(unused_variables)] - DimensionConstraints::Float64($range, $extent) => { - #[allow(dead_code)] - type $DT = f64; - $float +#[cfg(any(test, feature = "pod"))] +impl Debug for Dimension { + fn fmt(&self, f: &mut Formatter) -> FmtResult { + match tiledb_pod::array::dimension::DimensionData::try_from(self) { + Ok(d) => Debug::fmt(&d, f), + Err(e) => { + let RawDimension::Owned(ptr) = self.raw; + write!(f, "", ptr, e) } - DimensionConstraints::StringAscii => $string, } - }}; -} - -macro_rules! 
dimension_constraints_impl { - ($($V:ident : $U:ty),+) => { - $( - impl From<[$U; 2]> for DimensionConstraints { - fn from(value: [$U; 2]) -> DimensionConstraints { - DimensionConstraints::$V(value, None) - } - } - - impl From<&[$U; 2]> for DimensionConstraints { - fn from(value: &[$U; 2]) -> DimensionConstraints { - DimensionConstraints::$V([value[0], value[1]], None) - } - } - - impl From<([$U; 2], $U)> for DimensionConstraints { - fn from(value: ([$U; 2], $U)) -> DimensionConstraints { - DimensionConstraints::$V([value.0[0], value.0[1]], Some(value.1)) - } - } - - impl From<(&[$U; 2], $U)> for DimensionConstraints { - fn from(value: (&[$U; 2], $U)) -> DimensionConstraints { - DimensionConstraints::$V([value.0[0], value.0[1]], Some(value.1)) - } - } - - impl From<([$U; 2], Option<$U>)> for DimensionConstraints { - fn from(value: ([$U; 2], Option<$U>)) -> DimensionConstraints { - DimensionConstraints::$V([value.0[0], value.0[1]], value.1) - } - } - - impl From<(&[$U; 2], Option<$U>)> for DimensionConstraints { - fn from(value: (&[$U; 2], Option<$U>)) -> DimensionConstraints { - DimensionConstraints::$V([value.0[0], value.0[1]], value.1) - } - } - )+ - } -} - -dimension_constraints_impl!(Int8: i8, Int16: i16, Int32: i32, Int64: i64); -dimension_constraints_impl!(UInt8: u8, UInt16: u16, UInt32: u32, UInt64: u64); -dimension_constraints_impl!(Float32: f32, Float64: f64); - -impl DimensionConstraints { - /// Returns a [Datatype] which represents the physical type of this constraint. 
- pub fn physical_datatype(&self) -> Datatype { - match self { - Self::UInt8(_, _) => Datatype::UInt8, - Self::UInt16(_, _) => Datatype::UInt16, - Self::UInt32(_, _) => Datatype::UInt32, - Self::UInt64(_, _) => Datatype::UInt64, - Self::Int8(_, _) => Datatype::Int8, - Self::Int16(_, _) => Datatype::Int16, - Self::Int32(_, _) => Datatype::Int32, - Self::Int64(_, _) => Datatype::Int64, - Self::Float32(_, _) => Datatype::Float32, - Self::Float64(_, _) => Datatype::Float64, - Self::StringAscii => Datatype::StringAscii, - } - } - - pub fn cell_val_num(&self) -> CellValNum { - match self { - DimensionConstraints::StringAscii => CellValNum::Var, - _ => CellValNum::single(), - } - } - - pub fn verify_type_compatible( - &self, - datatype: Datatype, - ) -> TileDBResult<()> { - dimension_constraints_go!( - self, - DT, - _range, - _extent, - { - if !datatype.is_compatible_type::
() { - return Err(Error::Datatype( - DatatypeErrorKind::TypeMismatch { - user_type: std::any::type_name::
().to_owned(), - tiledb_type: datatype, - }, - )); - } - }, - { - if !matches!(datatype, Datatype::StringAscii) { - return Err(Error::Datatype( - DatatypeErrorKind::InvalidDatatype { - context: Some( - "DimensionConstraints::StringAscii".to_owned(), - ), - found: datatype, - expected: Datatype::StringAscii, - }, - )); - } - } - ); - - Ok(()) - } - - pub(crate) fn domain_ptr(&self) -> *const std::ffi::c_void { - dimension_constraints_go!( - self, - DT, - range, - _extent, - range.as_ptr() as *const DT as *const std::ffi::c_void, - std::ptr::null() - ) - } - - pub(crate) fn extent_ptr(&self) -> *const std::ffi::c_void { - dimension_constraints_go!( - self, - DT, - _range, - extent, - { - if let Some(extent) = extent { - extent as *const DT as *const std::ffi::c_void - } else { - std::ptr::null() - } - }, - std::ptr::null() - ) - } - - /// Returns the number of cells spanned by this constraint, if applicable - pub fn num_cells(&self) -> Option { - let (low, high) = crate::dimension_constraints_go!( - self, - _DT, - [low, high], - _, - (i128::from(*low), i128::from(*high)), - return None, - return None - ); - - Some(1 + (high - low) as u128) - } - - /// Returns the number of cells spanned by a - /// single tile under this constraint, if applicable - pub fn num_cells_per_tile(&self) -> Option { - crate::dimension_constraints_go!( - self, - _DT, - _, - extent, - extent.map(|extent| { - #[allow(clippy::unnecessary_fallible_conversions)] - // this `unwrap` should be safe, validation will confirm nonzero - usize::try_from(extent).unwrap() - }), - None, - None - ) - } - - /// Returns the domain of the dimension constraint, if present, as a range. 
- pub fn domain(&self) -> Option { - crate::dimension_constraints_go!( - self, - _DT, - [low, high], - _, - Some(SingleValueRange::from(&[*low, *high])), - None - ) } } @@ -462,10 +191,30 @@ impl Builder { let constraints = constraints.into(); constraints.verify_type_compatible(datatype)?; - let c_datatype = datatype.capi_enum(); + let c_datatype = ffi::tiledb_datatype_t::from(datatype); let c_name = cstring!(name); - let c_domain = constraints.domain_ptr(); - let c_extent = constraints.extent_ptr(); + let c_domain = dimension_constraints_go!( + constraints, + DT, + ref range, + ref _extent, + range.as_ptr() as *const DT as *const std::ffi::c_void, + std::ptr::null() + ); + let c_extent = dimension_constraints_go!( + constraints, + DT, + ref _range, + ref extent, + { + if let Some(extent) = extent { + extent as *const DT as *const std::ffi::c_void + } else { + std::ptr::null() + } + }, + std::ptr::null() + ); let mut c_dimension: *mut ffi::tiledb_dimension_t = std::ptr::null_mut(); @@ -493,7 +242,7 @@ impl Builder { } pub fn cell_val_num(self, num: CellValNum) -> TileDBResult { - let c_num = num.capi() as std::ffi::c_uint; + let c_num = std::ffi::c_uint::from(num); self.capi_call(|ctx| unsafe { ffi::tiledb_dimension_set_cell_val_num(ctx, *self.dim.raw, c_num) })?; @@ -521,216 +270,22 @@ impl From for Dimension { } } -/// Encapsulation of data needed to construct a Dimension -#[derive(Clone, Debug, Deserialize, OptionSubset, PartialEq, Serialize)] -pub struct DimensionData { - pub name: String, - pub datatype: Datatype, - pub constraints: DimensionConstraints, - - /// Optional filters to apply to the dimension. If None or Some(empty), - /// then filters will be inherited from the schema's `coordinate_filters` - /// field when the array is constructed. 
- pub filters: Option, -} - -impl DimensionData { - pub fn cell_val_num(&self) -> CellValNum { - self.constraints.cell_val_num() - } -} - -#[cfg(any(test, feature = "proptest-strategies"))] -impl DimensionData { - /// Returns a strategy for generating values of this dimension's type - /// which fall within the domain of this dimension. - pub fn value_strategy(&self) -> crate::query::strategy::FieldValueStrategy { - use crate::query::strategy::FieldValueStrategy; - use proptest::prelude::*; - - dimension_constraints_go!( - self.constraints, - DT, - ref domain, - _, - FieldValueStrategy::from((domain[0]..=domain[1]).boxed()), - { - assert_eq!(self.datatype, Datatype::StringAscii); - FieldValueStrategy::from(any::().boxed()) - } - ) - } - - /// Returns a strategy for generating subarray ranges which fall within - /// the domain of this dimension. - /// - /// `cell_bound` is an optional restriction on the number of possible values - /// which the strategy is allowed to return. - /// - /// If `cell_bound` is `None`, then this function always returns `Some`. 
- pub fn subarray_strategy( - &self, - cell_bound: Option, - ) -> Option> { - use proptest::prelude::Just; - use proptest::strategy::Strategy; - - use crate::range::{Range, VarValueRange}; - - dimension_constraints_go!( - self.constraints, - DT, - ref domain, - _, - { - let cell_bound = cell_bound - .map(|bound| DT::try_from(bound).unwrap_or(DT::MAX)) - .unwrap_or(DT::MAX); - - let domain_lower = domain[0]; - let domain_upper = domain[1]; - let strat = - (domain_lower..=domain_upper).prop_flat_map(move |lb| { - let ub = std::cmp::min( - domain_upper, - lb.checked_add(cell_bound).unwrap_or(DT::MAX), - ); - (Just(lb), lb..=ub).prop_map(|(min, max)| { - Range::Single(SingleValueRange::from(&[min, max])) - }) - }); - Some(strat.boxed()) - }, - { - if cell_bound.is_some() { - /* - * This can be implemented, but there's some ambiguity about - * what it should mean when precision goes out the window, - * so wait until there's a use case to decide. - */ - return None; - } - - let domain_lower = domain[0]; - let domain_upper = domain[1]; - let strat = - (domain_lower..=domain_upper).prop_flat_map(move |lb| { - (Just(lb), (lb..=domain_upper)).prop_map( - |(min, max)| { - Range::Single(SingleValueRange::from(&[ - min, max, - ])) - }, - ) - }); - Some(strat.boxed()) - }, - { - // DimensionConstraints::StringAscii - let strat_bound = - proptest::string::string_regex("[ -~]*").unwrap().boxed(); - - if cell_bound.is_some() { - /* - * This is not tractible unless there is a bound on the string length. - * There isn't one since `StringAscii` is only allowed as a dimension - * type in sparse arrays. 
- */ - return None; - } - - let strat = (strat_bound.clone(), strat_bound).prop_map( - |(ascii1, ascii2)| { - let (lb, ub) = if ascii1 < ascii2 { - (ascii1, ascii2) - } else { - (ascii2, ascii1) - }; - Range::Var(VarValueRange::from((lb, ub))) - }, - ); - Some(strat.boxed()) - } - ) - } -} - -impl Display for DimensionData { - fn fmt(&self, f: &mut Formatter) -> FmtResult { - write!(f, "{}", json!(*self)) - } -} - -impl TryFrom<&Dimension> for DimensionData { - type Error = crate::error::Error; - - fn try_from(dim: &Dimension) -> TileDBResult { - let datatype = dim.datatype()?; - let constraints = physical_type_go!(datatype, DT, { - let domain = dim.domain::
()?; - let extent = dim.extent::
()?; - if let Some(domain) = domain { - DimensionConstraints::from((domain, extent)) - } else { - assert!(extent.is_none()); - DimensionConstraints::StringAscii - } - }); - - Ok(DimensionData { - name: dim.name()?, - datatype, - constraints, - filters: { - let fl = FilterListData::try_from(&dim.filters()?)?; - if fl.is_empty() { - None - } else { - Some(fl) - } - }, - }) - } -} - -impl TryFrom for DimensionData { - type Error = crate::error::Error; - - fn try_from(dim: Dimension) -> TileDBResult { - Self::try_from(&dim) - } -} - -impl Factory for DimensionData { - type Item = Dimension; - - fn create(&self, context: &Context) -> TileDBResult { - let mut b = Builder::new( - context, - &self.name, - self.datatype, - self.constraints.clone(), - )?; - - if let Some(fl) = self.filters.as_ref() { - b = b.filters(fl.create(context)?)?; - } - - Ok(b.cell_val_num(self.cell_val_num())?.build()) - } -} - #[cfg(feature = "arrow")] pub mod arrow; -#[cfg(any(test, feature = "proptest-strategies"))] -pub mod strategy; +#[cfg(any(test, feature = "pod"))] +pub mod pod; #[cfg(test)] mod tests { - use crate::array::dimension::*; + use proptest::prelude::*; + use tiledb_pod::array::dimension::DimensionData; + use utils::assert_option_subset; + + use super::*; use crate::filter::list::Builder as FilterListBuilder; use crate::filter::*; + use crate::Factory; #[test] fn test_dimension_alloc() { @@ -970,92 +525,28 @@ mod tests { } } + /// Test that the arbitrary dimension construction always succeeds #[test] - fn subarray_strategy_dense() { - use super::strategy::Requirements; - use crate::array::ArrayType; - use crate::range::{Range, SingleValueRange}; - use proptest::prelude::*; - use proptest::strategy::Strategy; - use std::rc::Rc; - - let req = Requirements { - array_type: Some(ArrayType::Dense), - ..Default::default() - }; - let strat = ( - any_with::(req), - prop_oneof![Just(None), any::().prop_map(Some)], - ) - .prop_flat_map(|(d, cell_bound)| { - let subarray_strat = d - 
.subarray_strategy(cell_bound) - .expect("Dense dimension must have a subarray strategy"); - (Just(Rc::new(d)), Just(cell_bound), subarray_strat) - }); - - proptest!(|((d, cell_bound, s) in strat)| { - if let Some(bound) = cell_bound { - assert!(s.num_cells().unwrap() <= bound as u128); - } - if let Some(num_cells) = d.constraints.num_cells() { - assert!(s.num_cells().unwrap() <= num_cells); - } - let Range::Single(s) = s else { - unreachable!("Unexpected range for dense dimension: {:?}", s) - }; - let (start, end) = match s { - SingleValueRange::Int8(start, end) => { - let DimensionConstraints::Int8([lb, ub], _) = d.constraints else { unreachable!() }; - assert!(lb <= start); - assert!(end <= ub); - (start as i128, end as i128) - } - SingleValueRange::Int16(start, end) => { - let DimensionConstraints::Int16([lb, ub], _) = d.constraints else { unreachable!() }; - assert!(lb <= start); - assert!(end <= ub); - (start as i128, end as i128) - } - SingleValueRange::Int32(start, end) => { - let DimensionConstraints::Int32([lb, ub], _) = d.constraints else { unreachable!() }; - assert!(lb <= start); - assert!(end <= ub); - (start as i128, end as i128) - } - SingleValueRange::Int64(start, end) => { - let DimensionConstraints::Int64([lb, ub], _) = d.constraints else { unreachable!() }; - assert!(lb <= start); - assert!(end <= ub); - (start as i128, end as i128) - } - SingleValueRange::UInt8(start, end) => { - let DimensionConstraints::UInt8([lb, ub], _) = d.constraints else { unreachable!() }; - assert!(lb <= start); - assert!(end <= ub); - (start as i128, end as i128) - } - SingleValueRange::UInt16(start, end) => { - let DimensionConstraints::UInt16([lb, ub], _) = d.constraints else { unreachable!() }; - assert!(lb <= start); - assert!(end <= ub); - (start as i128, end as i128) - } - SingleValueRange::UInt32(start, end) => { - let DimensionConstraints::UInt32([lb, ub], _) = d.constraints else { unreachable!() }; - assert!(lb <= start); - assert!(end <= ub); - (start as 
i128, end as i128) - } - SingleValueRange::UInt64(start, end) => { - let DimensionConstraints::UInt64([lb, ub], _) = d.constraints else { unreachable!() }; - assert!(lb <= start); - assert!(end <= ub); - (start as i128, end as i128) - }, - s => unreachable!("Unexpected range type for dense dimension: {:?}", s) - }; - assert_eq!(Some((end - start + 1) as u128), s.num_cells()); + fn test_prop_dimension() { + let ctx = Context::new().expect("Error creating context"); + + proptest!(|(maybe_dimension in any::())| { + maybe_dimension.create(&ctx) + .expect("Error constructing arbitrary dimension"); + }); + } + + #[test] + fn dimension_eq_reflexivity() { + let ctx = Context::new().expect("Error creating context"); + + proptest!(|(dimension in any::())| { + assert_eq!(dimension, dimension); + assert_option_subset!(dimension, dimension); + + let dimension = dimension + .create(&ctx).expect("Error constructing arbitrary attribute"); + assert_eq!(dimension, dimension); }); } } diff --git a/tiledb/api/src/array/dimension/pod.rs b/tiledb/api/src/array/dimension/pod.rs new file mode 100644 index 00000000..b13d5129 --- /dev/null +++ b/tiledb/api/src/array/dimension/pod.rs @@ -0,0 +1,67 @@ +use tiledb_common::array::dimension::DimensionConstraints; +use tiledb_common::filter::FilterData; +use tiledb_common::physical_type_go; +use tiledb_pod::array::dimension::DimensionData; + +use super::{Builder, Dimension}; +use crate::error::Error as TileDBError; +use crate::{Context, Factory, Result as TileDBResult}; + +impl TryFrom<&Dimension> for DimensionData { + type Error = TileDBError; + + fn try_from(dim: &Dimension) -> Result { + let datatype = dim.datatype()?; + let constraints = physical_type_go!(datatype, DT, { + let domain = dim.domain::
()?; + let extent = dim.extent::
()?; + if let Some(domain) = domain { + DimensionConstraints::from((domain, extent)) + } else { + assert!(extent.is_none()); + DimensionConstraints::StringAscii + } + }); + + Ok(DimensionData { + name: dim.name()?, + datatype, + constraints, + filters: { + let fl = Vec::::try_from(&dim.filters()?)?; + if fl.is_empty() { + None + } else { + Some(fl) + } + }, + }) + } +} + +impl TryFrom for DimensionData { + type Error = TileDBError; + + fn try_from(dimension: Dimension) -> Result { + Self::try_from(&dimension) + } +} + +impl Factory for DimensionData { + type Item = Dimension; + + fn create(&self, context: &Context) -> TileDBResult { + let mut b = Builder::new( + context, + &self.name, + self.datatype, + self.constraints.clone(), + )?; + + if let Some(fl) = self.filters.as_ref() { + b = b.filters(fl.create(context)?)?; + } + + Ok(b.cell_val_num(self.cell_val_num())?.build()) + } +} diff --git a/tiledb/api/src/array/domain/mod.rs b/tiledb/api/src/array/domain/mod.rs index 587a2f98..f87ee47c 100644 --- a/tiledb/api/src/array/domain/mod.rs +++ b/tiledb/api/src/array/domain/mod.rs @@ -1,19 +1,15 @@ -use std::fmt::{Debug, Display, Formatter, Result as FmtResult}; use std::ops::Deref; +#[cfg(any(test, feature = "pod"))] +use std::fmt::{Debug, Formatter, Result as FmtResult}; + use anyhow::anyhow; -use serde::{Deserialize, Serialize}; -use serde_json::json; -use util::option::OptionSubset; -use crate::array::{ - dimension::DimensionData, dimension::RawDimension, Dimension, -}; +use crate::array::dimension::{Dimension, RawDimension}; use crate::context::{CApiInterface, Context, ContextBound}; use crate::error::Error; use crate::key::LookupKey; -use crate::range::{NonEmptyDomain, Range}; -use crate::{Factory, Result as TileDBResult}; +use crate::Result as TileDBResult; pub(crate) enum RawDomain { Owned(*mut ffi::tiledb_domain_t), @@ -162,16 +158,6 @@ impl Domain { } } -impl Debug for Domain { - fn fmt(&self, f: &mut Formatter) -> FmtResult { - let data = 
DomainData::try_from(self).map_err(|_| std::fmt::Error)?; - let mut json = json!(data); - json["raw"] = json!(format!("{:p}", *self.raw)); - - write!(f, "{}", json) - } -} - impl PartialEq for Domain { fn eq(&self, other: &Domain) -> bool { let ndim_match = match (self.num_dimensions(), other.num_dimensions()) { @@ -230,6 +216,19 @@ impl Iterator for Dimensions<'_> { } } +#[cfg(any(test, feature = "pod"))] +impl Debug for Domain { + fn fmt(&self, f: &mut Formatter) -> FmtResult { + match tiledb_pod::array::domain::DomainData::try_from(self) { + Ok(d) => Debug::fmt(&d, f), + Err(e) => { + let RawDomain::Owned(ptr) = self.raw; + write!(f, "", ptr, e) + } + } + } +} + pub struct Builder { domain: Domain, } @@ -277,111 +276,19 @@ impl From for Domain { } } -/// Encapsulation of data needed to construct a Domain -#[derive( - Clone, Default, Debug, Deserialize, OptionSubset, PartialEq, Serialize, -)] -pub struct DomainData { - pub dimension: Vec, -} - -impl DomainData { - /// Returns the total number of cells spanned by all dimensions, - /// or `None` if: - /// - any dimension is not constrained into a domain; or - /// - the total number of cells exceeds `usize::MAX`. - pub fn num_cells(&self) -> Option { - let mut total = 1u128; - for d in self.dimension.iter() { - total = total.checked_mul(d.constraints.num_cells()?)?; - } - usize::try_from(total).ok() - } - - /// Returns the number of cells in each tile, or `None` if: - /// - any dimension does not have a tile extent specified (e.g. for a sparse array); or - /// - the number of cells in a tile exceeds `usize::MAX`. 
- pub fn num_cells_per_tile(&self) -> Option { - let mut total = 1usize; - for d in self.dimension.iter() { - total = total.checked_mul(d.constraints.num_cells_per_tile()?)?; - } - Some(total) - } - - /// Returns the domains of each dimension as a `NonEmptyDomain`, - /// or `None` if any dimension is not constrained into a domain - pub fn domains(&self) -> Option { - self.dimension - .iter() - .map(|d| d.constraints.domain().map(Range::Single)) - .collect::>() - } -} - -#[cfg(any(test, feature = "proptest-strategies"))] -impl DomainData { - pub fn subarray_strategy( - &self, - ) -> impl proptest::prelude::Strategy> { - self.dimension - .iter() - .map(|d| d.subarray_strategy(None).unwrap()) - .collect::>>() - } -} - -impl Display for DomainData { - fn fmt(&self, f: &mut Formatter) -> FmtResult { - write!(f, "{}", json!(*self)) - } -} - -impl TryFrom<&Domain> for DomainData { - type Error = crate::error::Error; - - fn try_from(domain: &Domain) -> TileDBResult { - Ok(DomainData { - dimension: (0..domain.num_dimensions()?) - .map(|d| DimensionData::try_from(&domain.dimension(d)?)) - .collect::>>()?, - }) - } -} - -impl TryFrom for DomainData { - type Error = crate::error::Error; - - fn try_from(domain: Domain) -> TileDBResult { - Self::try_from(&domain) - } -} - -impl Factory for DomainData { - type Item = Domain; - - fn create(&self, context: &Context) -> TileDBResult { - Ok(self - .dimension - .iter() - .try_fold(Builder::new(context)?, |b, d| { - b.add_dimension(d.create(context)?) - })? 
- .build()) - } -} - -#[cfg(any(test, feature = "proptest-strategies"))] -pub mod strategy; +#[cfg(any(test, feature = "pod"))] +pub mod pod; #[cfg(test)] mod tests { use proptest::prelude::*; - use tiledb_utils::assert_option_subset; + use tiledb_pod::array::dimension::DimensionData; + use tiledb_pod::array::domain::DomainData; + use utils::assert_option_subset; use crate::array::domain::Builder; use crate::array::*; - use crate::{Datatype, Factory}; + use crate::{Context, Datatype, Factory}; #[test] fn test_add_dimension() { @@ -567,6 +474,32 @@ mod tests { assert_ne!(domain_d1_int32, domain_d1_float64); } + /// Test that the arbitrary domain construction always succeeds + #[test] + fn domain_arbitrary() { + let ctx = Context::new().expect("Error creating context"); + + proptest!(|(maybe_domain in any::())| { + maybe_domain.create(&ctx) + .expect("Error constructing arbitrary domain"); + }); + } + + #[test] + fn domain_eq_reflexivity() { + let ctx = Context::new().expect("Error creating context"); + + proptest!(|(domain in any::())| { + assert_eq!(domain, domain); + assert_option_subset!(domain, domain); + + let domain = domain.create(&ctx) + .expect("Error constructing arbitrary domain"); + assert_eq!(domain, domain); + }); + } + + /// Test iteration over [Domain] dimensions fn do_test_dimensions_iter(spec: DomainData) -> TileDBResult<()> { let context = Context::new()?; let domain = spec.create(&context)?; diff --git a/tiledb/api/src/array/domain/pod.rs b/tiledb/api/src/array/domain/pod.rs new file mode 100644 index 00000000..6b575679 --- /dev/null +++ b/tiledb/api/src/array/domain/pod.rs @@ -0,0 +1,40 @@ +use tiledb_pod::array::dimension::DimensionData; +use tiledb_pod::array::domain::DomainData; + +use super::{Builder, Domain}; +use crate::error::Error as TileDBError; +use crate::{Context, Factory, Result as TileDBResult}; + +impl TryFrom<&Domain> for DomainData { + type Error = TileDBError; + + fn try_from(domain: &Domain) -> Result { + Ok(DomainData { + 
dimension: (0..domain.num_dimensions()?) + .map(|d| DimensionData::try_from(&domain.dimension(d)?)) + .collect::>>()?, + }) + } +} + +impl TryFrom for DomainData { + type Error = TileDBError; + + fn try_from(domain: Domain) -> Result { + Self::try_from(&domain) + } +} + +impl Factory for DomainData { + type Item = Domain; + + fn create(&self, context: &Context) -> TileDBResult { + Ok(self + .dimension + .iter() + .try_fold(Builder::new(context)?, |b, d| { + b.add_dimension(d.create(context)?) + })? + .build()) + } +} diff --git a/tiledb/api/src/array/enumeration/mod.rs b/tiledb/api/src/array/enumeration/mod.rs index 19a72f1c..90773eff 100644 --- a/tiledb/api/src/array/enumeration/mod.rs +++ b/tiledb/api/src/array/enumeration/mod.rs @@ -1,17 +1,11 @@ -use std::fmt::{self, Debug, Formatter, Result as FmtResult}; use std::ops::Deref; -use serde::{Deserialize, Serialize}; -use serde_json::json; - -use util::option::OptionSubset; +#[cfg(any(test, feature = "pod"))] +use std::fmt::{Debug, Formatter, Result as FmtResult}; use crate::context::{CApiInterface, Context, ContextBound}; use crate::string::{RawTDBString, TDBString}; -use crate::{Datatype, Factory, Result as TileDBResult}; - -#[cfg(any(test, feature = "proptest-strategies"))] -pub mod strategy; +use crate::{Datatype, Result as TileDBResult}; pub(crate) enum RawEnumeration { Owned(*mut ffi::tiledb_enumeration_t), @@ -74,7 +68,7 @@ impl Enumeration { ffi::tiledb_enumeration_get_type(ctx, c_enmr, &mut dtype) })?; - Datatype::try_from(dtype) + Ok(Datatype::try_from(dtype)?) 
} pub fn cell_val_num(&self) -> TileDBResult { @@ -197,26 +191,6 @@ impl Enumeration { } } -impl Debug for Enumeration { - fn fmt(&self, f: &mut Formatter) -> FmtResult { - let name = self.name().map_err(|_| fmt::Error)?; - let dtype = self.datatype().map_err(|_| fmt::Error)?; - - let dtype_string = dtype.to_string(); - let cell_val_num = self.cell_val_num().map_err(|_| fmt::Error)?; - let ordered = self.ordered().map_err(|_| fmt::Error)?; - - let json = json!({ - "name": name, - "datatype": dtype_string, - "cell_val_num": cell_val_num, - "ordered": ordered, - "values": [], // TODO: Render values - }); - write!(f, "{}", json) - } -} - impl PartialEq for Enumeration { fn eq(&self, other: &Enumeration) -> bool { eq_helper!(self.name(), other.name()); @@ -240,6 +214,23 @@ impl PartialEq for Enumeration { } } +#[cfg(any(test, feature = "pod"))] +impl Debug for Enumeration { + fn fmt(&self, f: &mut Formatter) -> FmtResult { + match tiledb_pod::array::enumeration::EnumerationData::try_from(self) { + Ok(e) => Debug::fmt(&e, f), + Err(e) => { + let RawEnumeration::Owned(ptr) = self.raw; + write!( + f, + "", + ptr, e + ) + } + } + } +} + pub struct Builder<'data, 'offsets> { context: Context, name: String, @@ -316,7 +307,7 @@ impl<'data, 'offsets> Builder<'data, 'offsets> { let mut c_enmr: *mut ffi::tiledb_enumeration_t = out_ptr!(); let name_bytes = self.name.as_bytes(); let c_name = cstring!(name_bytes); - let c_dtype = self.dtype.capi_enum(); + let c_dtype = ffi::tiledb_datatype_t::from(self.dtype); // Rust semantics require that slice pointers aren't nullptr so that // nullptr can be used to distinguish between Some and None. 
The stdlib @@ -356,64 +347,16 @@ impl<'data, 'offsets> Builder<'data, 'offsets> { } } -/// Encapsulation of data needed to construct an Enumeration -#[derive(Clone, Debug, Deserialize, Serialize, PartialEq, OptionSubset)] -pub struct EnumerationData { - pub name: String, - pub datatype: Datatype, - pub cell_val_num: Option, - pub ordered: Option, - pub data: Box<[u8]>, - pub offsets: Option>, -} - -impl TryFrom<&Enumeration> for EnumerationData { - type Error = crate::error::Error; - - fn try_from(enmr: &Enumeration) -> TileDBResult { - let datatype = enmr.datatype()?; - let cell_val_num = enmr.cell_val_num()?; - let data = Box::from(enmr.data()?); - let offsets: Option> = enmr.offsets()?.map(Box::from); - - Ok(EnumerationData { - name: enmr.name()?, - datatype, - cell_val_num: Some(cell_val_num), - ordered: Some(enmr.ordered()?), - data, - offsets, - }) - } -} - -impl Factory for EnumerationData { - type Item = Enumeration; - - fn create(&self, context: &Context) -> TileDBResult { - let mut b = Builder::new( - context, - &self.name, - self.datatype, - &self.data[..], - self.offsets.as_ref().map(|o| &o[..]), - ); - - if let Some(cvn) = self.cell_val_num { - b = b.cell_val_num(cvn); - } - - if let Some(ordered) = self.ordered { - b = b.ordered(ordered); - } - - b.build() - } -} +#[cfg(any(test, feature = "pod"))] +pub mod pod; #[cfg(test)] mod tests { + use proptest::prelude::*; + use tiledb_pod::array::enumeration::EnumerationData; + use super::*; + use crate::{Context, Factory}; #[test] fn basic_build() -> TileDBResult<()> { @@ -598,4 +541,25 @@ mod tests { Ok(()) } + + /// Test that the arbitrary enumeration construction always succeeds + #[test] + fn enumeration_arbitrary() { + let ctx = Context::new().expect("Error creating context"); + + proptest!(|(enmr in any::())| { + enmr.create(&ctx).expect("Error constructing arbitrary enumeration"); + }); + } + + #[test] + fn enumeration_eq_reflexivity() { + let ctx = Context::new().expect("Error creating context"); + + 
proptest!(|(enmr in any::())| { + let enmr = enmr.create(&ctx) + .expect("Error constructing arbitrary enumeration"); + assert_eq!(enmr, enmr); + }); + } } diff --git a/tiledb/api/src/array/enumeration/pod.rs b/tiledb/api/src/array/enumeration/pod.rs new file mode 100644 index 00000000..ba0625a6 --- /dev/null +++ b/tiledb/api/src/array/enumeration/pod.rs @@ -0,0 +1,57 @@ +use tiledb_pod::array::enumeration::EnumerationData; + +use super::{Builder, Enumeration}; +use crate::error::Error as TileDBError; +use crate::{Context, Factory, Result as TileDBResult}; + +impl TryFrom<&Enumeration> for EnumerationData { + type Error = TileDBError; + + fn try_from(enmr: &Enumeration) -> Result { + let datatype = enmr.datatype()?; + let cell_val_num = enmr.cell_val_num()?; + let data = Box::from(enmr.data()?); + let offsets: Option> = enmr.offsets()?.map(Box::from); + + Ok(EnumerationData { + name: enmr.name()?, + datatype, + cell_val_num: Some(cell_val_num), + ordered: Some(enmr.ordered()?), + data, + offsets, + }) + } +} + +impl TryFrom for EnumerationData { + type Error = TileDBError; + + fn try_from(enmr: Enumeration) -> Result { + Self::try_from(&enmr) + } +} + +impl Factory for EnumerationData { + type Item = Enumeration; + + fn create(&self, context: &Context) -> TileDBResult { + let mut b = Builder::new( + context, + &self.name, + self.datatype, + &self.data[..], + self.offsets.as_ref().map(|o| &o[..]), + ); + + if let Some(cvn) = self.cell_val_num { + b = b.cell_val_num(cvn); + } + + if let Some(ordered) = self.ordered { + b = b.ordered(ordered); + } + + b.build() + } +} diff --git a/tiledb/api/src/array/fragment_info.rs b/tiledb/api/src/array/fragment_info.rs index cfcb0601..883bac7c 100644 --- a/tiledb/api/src/array/fragment_info.rs +++ b/tiledb/api/src/array/fragment_info.rs @@ -7,7 +7,7 @@ use crate::array::schema::{RawSchema, Schema}; use crate::config::{Config, RawConfig}; use crate::context::{CApiInterface, Context, ContextBound}; use crate::datatype::Datatype; 
-use crate::error::{DatatypeErrorKind, Error}; +use crate::error::{DatatypeError, Error}; use crate::physical_type_go; use crate::range::{ MinimumBoundingRectangle, Range, TypedNonEmptyDomain, TypedRange, @@ -346,17 +346,15 @@ impl FragmentInfoInternal { physical_type_go!(datatype, DT, { if start_size % std::mem::size_of::
() as u64 != 0 { - return Err(Error::Datatype(DatatypeErrorKind::TypeMismatch { - user_type: std::any::type_name::
().to_owned(), - tiledb_type: datatype, - })); + return Err(Error::Datatype( + DatatypeError::physical_type_incompatible::
(datatype), + )); } if end_size % std::mem::size_of::
() as u64 != 0 { - return Err(Error::Datatype(DatatypeErrorKind::TypeMismatch { - user_type: std::any::type_name::
().to_owned(), - tiledb_type: datatype, - })); + return Err(Error::Datatype( + DatatypeError::physical_type_incompatible::
(datatype), + )); } let start_elems = start_size / std::mem::size_of::
() as u64; @@ -463,17 +461,15 @@ impl FragmentInfoInternal { physical_type_go!(datatype, DT, { if start_size % std::mem::size_of::
() as u64 != 0 { - return Err(Error::Datatype(DatatypeErrorKind::TypeMismatch { - user_type: std::any::type_name::
().to_owned(), - tiledb_type: datatype, - })); + return Err(Error::Datatype( + DatatypeError::physical_type_incompatible::
(datatype), + )); } if end_size % std::mem::size_of::
() as u64 != 0 { - return Err(Error::Datatype(DatatypeErrorKind::TypeMismatch { - user_type: std::any::type_name::
().to_owned(), - tiledb_type: datatype, - })); + return Err(Error::Datatype( + DatatypeError::physical_type_incompatible::
(datatype), + )); } let start_elems = start_size / std::mem::size_of::
() as u64; @@ -744,7 +740,7 @@ impl Builder { #[cfg(test)] pub mod tests { - use tiledb_test_utils::{self, TestArrayUri}; + use uri::{self, TestArrayUri}; use super::*; use crate::array::*; @@ -755,7 +751,7 @@ pub mod tests { #[test] fn test_set_config() -> TileDBResult<()> { let ctx = Context::new().unwrap(); - let test_uri = tiledb_test_utils::get_uri_generator() + let test_uri = uri::get_uri_generator() .map_err(|e| Error::Other(e.to_string()))?; let array_uri = create_dense_array(&ctx, &test_uri)?; @@ -771,7 +767,7 @@ pub mod tests { #[test] fn test_get_config() -> TileDBResult<()> { let ctx = Context::new().unwrap(); - let test_uri = tiledb_test_utils::get_uri_generator() + let test_uri = uri::get_uri_generator() .map_err(|e| Error::Other(e.to_string()))?; let array_uri = create_dense_array(&ctx, &test_uri)?; let frag_infos = Builder::new(&ctx, array_uri)?.build()?; @@ -784,7 +780,7 @@ pub mod tests { #[test] fn test_load_infos() -> TileDBResult<()> { let ctx = Context::new().unwrap(); - let test_uri = tiledb_test_utils::get_uri_generator() + let test_uri = uri::get_uri_generator() .map_err(|e| Error::Other(e.to_string()))?; let array_uri = create_dense_array(&ctx, &test_uri)?; let frag_infos = Builder::new(&ctx, array_uri)?.build_without_loading(); @@ -797,7 +793,7 @@ pub mod tests { #[test] fn test_unconsolidated_metadata_num() -> TileDBResult<()> { let ctx = Context::new().unwrap(); - let test_uri = tiledb_test_utils::get_uri_generator() + let test_uri = uri::get_uri_generator() .map_err(|e| Error::Other(e.to_string()))?; let array_uri = create_dense_array(&ctx, &test_uri)?; let frag_infos = Builder::new(&ctx, array_uri)?.build()?; @@ -810,7 +806,7 @@ pub mod tests { #[test] fn test_num_to_vacuum() -> TileDBResult<()> { let ctx = Context::new().unwrap(); - let test_uri = tiledb_test_utils::get_uri_generator() + let test_uri = uri::get_uri_generator() .map_err(|e| Error::Other(e.to_string()))?; let array_uri = create_dense_array(&ctx, &test_uri)?; let 
frag_infos = Builder::new(&ctx, array_uri)?.build()?; @@ -823,7 +819,7 @@ pub mod tests { #[test] fn test_total_cell_count() -> TileDBResult<()> { let ctx = Context::new().unwrap(); - let test_uri = tiledb_test_utils::get_uri_generator() + let test_uri = uri::get_uri_generator() .map_err(|e| Error::Other(e.to_string()))?; let array_uri = create_dense_array(&ctx, &test_uri)?; let frag_infos = Builder::new(&ctx, array_uri)?.build()?; @@ -837,7 +833,7 @@ pub mod tests { #[test] fn test_num_fragments() -> TileDBResult<()> { let ctx = Context::new().unwrap(); - let test_uri = tiledb_test_utils::get_uri_generator() + let test_uri = uri::get_uri_generator() .map_err(|e| Error::Other(e.to_string()))?; let array_uri = create_dense_array(&ctx, &test_uri)?; let frag_infos = Builder::new(&ctx, array_uri)?.build()?; @@ -851,7 +847,7 @@ pub mod tests { #[test] fn test_get_fragment() -> TileDBResult<()> { let ctx = Context::new().unwrap(); - let test_uri = tiledb_test_utils::get_uri_generator() + let test_uri = uri::get_uri_generator() .map_err(|e| Error::Other(e.to_string()))?; let array_uri = create_dense_array(&ctx, &test_uri)?; let frag_infos = Builder::new(&ctx, array_uri)?.build()?; @@ -864,7 +860,7 @@ pub mod tests { #[test] fn test_get_fragment_failure() -> TileDBResult<()> { let ctx = Context::new().unwrap(); - let test_uri = tiledb_test_utils::get_uri_generator() + let test_uri = uri::get_uri_generator() .map_err(|e| Error::Other(e.to_string()))?; let array_uri = create_dense_array(&ctx, &test_uri)?; let frag_infos = Builder::new(&ctx, array_uri)?.build()?; @@ -877,7 +873,7 @@ pub mod tests { #[test] fn test_iter_fragments() -> TileDBResult<()> { let ctx = Context::new().unwrap(); - let test_uri = tiledb_test_utils::get_uri_generator() + let test_uri = uri::get_uri_generator() .map_err(|e| Error::Other(e.to_string()))?; let array_uri = create_dense_array(&ctx, &test_uri)?; let frag_infos = Builder::new(&ctx, array_uri)?.build()?; @@ -895,7 +891,7 @@ pub mod tests { 
#[test] fn test_fragment_info_apis() -> TileDBResult<()> { let ctx = Context::new().unwrap(); - let test_uri = tiledb_test_utils::get_uri_generator() + let test_uri = uri::get_uri_generator() .map_err(|e| Error::Other(e.to_string()))?; let dense_array_uri = create_dense_array(&ctx, &test_uri)?; let sparse_array_uri = create_sparse_array(&ctx, &test_uri)?; diff --git a/tiledb/api/src/array/mod.rs b/tiledb/api/src/array/mod.rs index 610850dd..e35829d9 100644 --- a/tiledb/api/src/array/mod.rs +++ b/tiledb/api/src/array/mod.rs @@ -5,15 +5,14 @@ use std::ops::Deref; use std::str::FromStr; use anyhow::anyhow; -use serde::{Deserialize, Serialize}; -use util::option::OptionSubset; use crate::array::enumeration::RawEnumeration; use crate::array::schema::RawSchema; use crate::context::{CApiInterface, Context, ContextBound}; use crate::datatype::PhysicalType; -use crate::error::{DatatypeErrorKind, Error, ModeErrorKind}; +use crate::error::{DatatypeError, Error}; use crate::key::LookupKey; +use crate::metadata; use crate::metadata::Metadata; use crate::range::{ Range, SingleValueRange, TypedNonEmptyDomain, TypedRange, VarValueRange, @@ -29,142 +28,21 @@ pub mod fragment_info; pub mod schema; use crate::config::Config; -pub use attribute::{Attribute, AttributeData, Builder as AttributeBuilder}; + +pub use attribute::{Attribute, Builder as AttributeBuilder}; pub use dimension::{ - Builder as DimensionBuilder, Dimension, DimensionConstraints, DimensionData, -}; -pub use domain::{Builder as DomainBuilder, Domain, DomainData}; -pub use enumeration::{ - Builder as EnumerationBuilder, Enumeration, EnumerationData, + Builder as DimensionBuilder, Dimension, DimensionConstraints, }; +pub use domain::{Builder as DomainBuilder, Domain}; +pub use enumeration::{Builder as EnumerationBuilder, Enumeration}; use ffi::tiledb_config_t; pub use fragment_info::{ Builder as FragmentInfoBuilder, FragmentInfo, FragmentInfoList, }; pub use schema::{ - ArrayType, Builder as SchemaBuilder, CellValNum, 
Field, Schema, SchemaData, + ArrayType, Builder as SchemaBuilder, CellValNum, Field, Schema, }; - -#[derive(Clone, Debug, PartialEq)] -pub enum Mode { - Read, - Write, - Delete, - Update, - ModifyExclusive, -} - -impl Mode { - pub(crate) fn capi_enum(&self) -> ffi::tiledb_query_type_t { - match *self { - Mode::Read => ffi::tiledb_query_type_t_TILEDB_READ, - Mode::Write => ffi::tiledb_query_type_t_TILEDB_WRITE, - Mode::Delete => ffi::tiledb_query_type_t_TILEDB_DELETE, - Mode::Update => ffi::tiledb_query_type_t_TILEDB_UPDATE, - Mode::ModifyExclusive => { - ffi::tiledb_query_type_t_TILEDB_MODIFY_EXCLUSIVE - } - } - } -} - -impl TryFrom for Mode { - type Error = crate::error::Error; - - fn try_from(value: ffi::tiledb_query_type_t) -> TileDBResult { - Ok(match value { - ffi::tiledb_query_type_t_TILEDB_READ => Mode::Read, - ffi::tiledb_query_type_t_TILEDB_WRITE => Mode::Write, - ffi::tiledb_query_type_t_TILEDB_DELETE => Mode::Delete, - ffi::tiledb_query_type_t_TILEDB_UPDATE => Mode::Update, - ffi::tiledb_query_type_t_TILEDB_MODIFY_EXCLUSIVE => { - Mode::ModifyExclusive - } - _ => { - return Err(Error::ModeType( - ModeErrorKind::InvalidDiscriminant(value as u64), - )) - } - }) - } -} - -impl Display for Mode { - fn fmt(&self, f: &mut Formatter) -> FmtResult { - ::fmt(self, f) - } -} - -#[derive( - Clone, Copy, Debug, Deserialize, Eq, OptionSubset, PartialEq, Serialize, -)] -pub enum TileOrder { - RowMajor, - ColumnMajor, -} - -impl TileOrder { - pub(crate) fn capi_enum(&self) -> ffi::tiledb_layout_t { - match *self { - TileOrder::RowMajor => ffi::tiledb_layout_t_TILEDB_ROW_MAJOR, - TileOrder::ColumnMajor => ffi::tiledb_layout_t_TILEDB_COL_MAJOR, - } - } -} - -impl TryFrom for TileOrder { - type Error = crate::error::Error; - fn try_from(value: ffi::tiledb_layout_t) -> TileDBResult { - match value { - ffi::tiledb_layout_t_TILEDB_ROW_MAJOR => Ok(TileOrder::RowMajor), - ffi::tiledb_layout_t_TILEDB_COL_MAJOR => Ok(TileOrder::ColumnMajor), - _ => 
Err(Self::Error::LibTileDB(format!( - "Invalid tile order: {}", - value - ))), - } - } -} - -#[derive( - Clone, Copy, Debug, Deserialize, Eq, OptionSubset, PartialEq, Serialize, -)] -pub enum CellOrder { - Unordered, - RowMajor, - ColumnMajor, - Global, - Hilbert, -} - -impl CellOrder { - pub(crate) fn capi_enum(&self) -> ffi::tiledb_layout_t { - match *self { - CellOrder::Unordered => ffi::tiledb_layout_t_TILEDB_UNORDERED, - CellOrder::RowMajor => ffi::tiledb_layout_t_TILEDB_ROW_MAJOR, - CellOrder::ColumnMajor => ffi::tiledb_layout_t_TILEDB_COL_MAJOR, - CellOrder::Global => ffi::tiledb_layout_t_TILEDB_GLOBAL_ORDER, - CellOrder::Hilbert => ffi::tiledb_layout_t_TILEDB_HILBERT, - } - } -} - -impl TryFrom for CellOrder { - type Error = crate::error::Error; - fn try_from(value: ffi::tiledb_layout_t) -> TileDBResult { - match value { - ffi::tiledb_layout_t_TILEDB_UNORDERED => Ok(CellOrder::Unordered), - ffi::tiledb_layout_t_TILEDB_ROW_MAJOR => Ok(CellOrder::RowMajor), - ffi::tiledb_layout_t_TILEDB_COL_MAJOR => Ok(CellOrder::ColumnMajor), - ffi::tiledb_layout_t_TILEDB_GLOBAL_ORDER => Ok(CellOrder::Global), - ffi::tiledb_layout_t_TILEDB_HILBERT => Ok(CellOrder::Hilbert), - _ => Err(Self::Error::LibTileDB(format!( - "Invalid cell order: {}", - value - ))), - } - } -} +pub use tiledb_common::array::{CellOrder, Mode, TileOrder}; /// Method of encryption. #[derive(Clone, Copy, Debug, Eq, PartialEq)] @@ -173,21 +51,9 @@ pub enum Encryption { Aes256Gcm, } -impl Encryption { - /// Returns the corresponding C API constant. 
- pub(crate) fn capi_enum(&self) -> ffi::tiledb_encryption_type_t { - match *self { - Self::Unencrypted => { - ffi::tiledb_encryption_type_t_TILEDB_NO_ENCRYPTION - } - Self::Aes256Gcm => ffi::tiledb_encryption_type_t_TILEDB_AES_256_GCM, - } - } -} - impl Display for Encryption { fn fmt(&self, f: &mut Formatter) -> FmtResult { - let c_encryption = self.capi_enum(); + let c_encryption = ffi::tiledb_encryption_type_t::from(*self); let mut c_str = out_ptr!(); let c_ret = unsafe { @@ -230,6 +96,19 @@ impl FromStr for Encryption { } } +impl From for ffi::tiledb_encryption_type_t { + fn from(value: Encryption) -> Self { + match value { + Encryption::Unencrypted => { + ffi::tiledb_encryption_type_t_TILEDB_NO_ENCRYPTION + } + Encryption::Aes256Gcm => { + ffi::tiledb_encryption_type_t_TILEDB_AES_256_GCM + } + } + } +} + impl TryFrom for Encryption { type Error = crate::error::Error; @@ -381,7 +260,8 @@ impl Array { pub fn put_metadata(&mut self, metadata: Metadata) -> TileDBResult<()> { let c_array = *self.raw; - let (vec_size, vec_ptr, datatype) = metadata.c_data(); + let (vec_size, vec_ptr, datatype) = + metadata::metadata_to_ffi(&metadata); let c_key = cstring!(metadata.key); self.capi_call(|ctx| unsafe { ffi::tiledb_array_put_metadata( @@ -389,7 +269,7 @@ impl Array { c_array, c_key.as_ptr(), datatype, - vec_size as u32, + vec_size, vec_ptr, ) })?; @@ -461,7 +341,11 @@ impl Array { } }?; let datatype = Datatype::try_from(c_datatype)?; - Ok(Metadata::new_raw(name, datatype, vec_ptr, vec_size)) + Ok(metadata::metadata_from_ffi( + name, + datatype, + (vec_size, vec_ptr), + )) } pub fn has_metadata_key(&self, name: S) -> TileDBResult> @@ -729,20 +613,18 @@ impl Array { } if start_size % std::mem::size_of::
() as u64 != 0 { - return Err(Error::Datatype( - DatatypeErrorKind::TypeMismatch { - user_type: std::any::type_name::
().to_owned(), - tiledb_type: datatype, - }, + return Err(Error::from( + DatatypeError::physical_type_incompatible::
( + datatype, + ), )); } if end_size % std::mem::size_of::
() as u64 != 0 { - return Err(Error::Datatype( - DatatypeErrorKind::TypeMismatch { - user_type: std::any::type_name::
().to_owned(), - tiledb_type: datatype, - }, + return Err(Error::from( + DatatypeError::physical_type_incompatible::
( + datatype, + ), )); } @@ -794,20 +676,18 @@ impl Array { } if start_size % std::mem::size_of::
() as u64 != 0 { - return Err(Error::Datatype( - DatatypeErrorKind::TypeMismatch { - user_type: std::any::type_name::
().to_owned(), - tiledb_type: datatype, - }, + return Err(Error::from( + DatatypeError::physical_type_incompatible::
( + datatype, + ), )); } if end_size % std::mem::size_of::
() as u64 != 0 { - return Err(Error::Datatype( - DatatypeErrorKind::TypeMismatch { - user_type: std::any::type_name::
().to_owned(), - tiledb_type: datatype, - }, + return Err(Error::from( + DatatypeError::physical_type_incompatible::
( + datatype, + ), )); } @@ -1008,7 +888,7 @@ impl ArrayOpener { let c_array = *self.array.raw; if let Some(mode) = self.mode { - let c_mode = mode.capi_enum(); + let c_mode = ffi::tiledb_query_type_t::from(mode); self.array.capi_call(|ctx| unsafe { ffi::tiledb_array_open(ctx, c_array, c_mode) })?; @@ -1022,21 +902,24 @@ impl ArrayOpener { } } -#[cfg(any(test, feature = "proptest-strategies"))] -pub mod strategy; - #[cfg(test)] pub mod tests { - use tiledb_test_utils::{self, TestArrayUri}; + use proptest::prelude::*; + use tiledb_common::array::dimension::DimensionConstraints; + use tiledb_common::datatype::Datatype; + use tiledb_common::metadata::Value; + use tiledb_pod::array::enumeration::EnumerationData; + use tiledb_pod::array::schema::SchemaData; + use uri::{self, TestArrayUri}; + use utils::assert_option_subset; + use utils::option::OptionSubset; use super::*; - use crate::array::dimension::DimensionConstraints; use crate::config::CommonOption; - use crate::metadata::Value; use crate::query::{ Query, QueryBuilder, QueryLayout, QueryType, WriteBuilder, }; - use crate::{Datatype, Factory}; + use crate::Factory; /// Create the array used in the "quickstart_dense" example pub fn create_quickstart_dense( @@ -1110,7 +993,7 @@ pub mod tests { #[test] fn test_array_create() -> TileDBResult<()> { - let test_uri = tiledb_test_utils::get_uri_generator() + let test_uri = uri::get_uri_generator() .map_err(|e| Error::Other(e.to_string()))?; let c: Context = Context::new().unwrap(); @@ -1124,9 +1007,30 @@ pub mod tests { Ok(()) } + #[test] + fn proptest_array_create() { + let ctx = Context::new().expect("Error creating context"); + + proptest!(|(schema_spec in any::())| { + let schema_in = schema_spec.create(&ctx) + .expect("Error constructing arbitrary schema"); + + let test_uri = uri::get_uri_generator().map_err(|e| Error::Other(e.to_string()))?; + let uri = test_uri.with_path("array").map_err(|e| Error::Other(e.to_string()))?; + + Array::create(&ctx, &uri, schema_in) + 
.expect("Error creating array"); + + let schema_out = Schema::load(&ctx, &uri).expect("Error loading array schema"); + + let schema_out_spec = SchemaData::try_from(&schema_out).expect("Error creating schema spec"); + assert_option_subset!(schema_spec, schema_out_spec); + }) + } + #[test] fn test_array_metadata() -> TileDBResult<()> { - let test_uri = tiledb_test_utils::get_uri_generator() + let test_uri = uri::get_uri_generator() .map_err(|e| Error::Other(e.to_string()))?; let tdb = Context::new()?; @@ -1194,7 +1098,7 @@ pub mod tests { #[test] fn test_mode_metadata() -> TileDBResult<()> { - let test_uri = tiledb_test_utils::get_uri_generator() + let test_uri = uri::get_uri_generator() .map_err(|e| Error::Other(e.to_string()))?; let tdb = Context::new()?; @@ -1250,6 +1154,29 @@ pub mod tests { test_uri.close().map_err(|e| Error::Other(e.to_string())) } + #[test] + fn arbitrary_metadata() { + let test_uri = uri::get_uri_generator().unwrap(); + let uri = test_uri.with_path("quickstart_dense").unwrap(); + + let c: Context = Context::new().unwrap(); + create_quickstart_dense(&test_uri, &c).unwrap(); + + proptest!(move |(m_in in any::())| { + // write + { + let mut a = Array::open(&c, &uri, Mode::Write).unwrap(); + a.put_metadata(m_in.clone()).expect("Error writing metadata"); + } + // read + { + let a = Array::open(&c, &uri, Mode::Read).unwrap(); + let m_out = a.metadata(m_in.key.clone()).expect("Error reading metadata"); + assert_eq!(m_in, m_out); + } + }); + } + fn create_simple_dense( test_uri: &dyn TestArrayUri, ctx: &Context, @@ -1316,7 +1243,7 @@ pub mod tests { // Test advanced consolidation. Based on unit-capi-consolidation.cc. 
let ctx: Context = Context::new().unwrap(); - let array_uri = tiledb_test_utils::get_uri_generator().unwrap(); + let array_uri = uri::get_uri_generator().unwrap(); let array_uri = create_simple_dense(&array_uri, &ctx)?; write_dense_vector_4_fragments(&ctx, &array_uri, 0).unwrap(); @@ -1359,7 +1286,7 @@ pub mod tests { #[test] fn delete() -> TileDBResult<()> { - let test_uri = tiledb_test_utils::get_uri_generator() + let test_uri = uri::get_uri_generator() .map_err(|e| Error::Other(e.to_string()))?; let c: Context = Context::new().unwrap(); @@ -1380,7 +1307,7 @@ pub mod tests { #[test] fn create_enumeration() -> TileDBResult<()> { - let test_uri = tiledb_test_utils::get_uri_generator() + let test_uri = uri::get_uri_generator() .map_err(|e| Error::Other(e.to_string()))?; let uri = test_uri @@ -1482,17 +1409,23 @@ pub mod tests { fn encryption_type_capi() { assert_eq!( Encryption::Unencrypted, - Encryption::try_from(Encryption::Unencrypted.capi_enum()).unwrap() + Encryption::try_from(ffi::tiledb_encryption_type_t::from( + Encryption::Unencrypted + )) + .unwrap() ); assert_eq!( Encryption::Aes256Gcm, - Encryption::try_from(Encryption::Aes256Gcm.capi_enum()).unwrap() + Encryption::try_from(ffi::tiledb_encryption_type_t::from( + Encryption::Aes256Gcm + )) + .unwrap() ); } #[test] fn encrypted_array() -> TileDBResult<()> { - let test_uri = tiledb_test_utils::get_uri_generator() + let test_uri = uri::get_uri_generator() .map_err(|e| Error::Other(e.to_string()))?; let key = "0123456789abcdeF0123456789abcdeF"; diff --git a/tiledb/api/src/array/schema/arrow.rs b/tiledb/api/src/array/schema/arrow.rs index 03e8ee07..7ca66a4e 100644 --- a/tiledb/api/src/array/schema/arrow.rs +++ b/tiledb/api/src/array/schema/arrow.rs @@ -256,11 +256,14 @@ pub fn attributes(schema: &ArrowSchema) -> TileDBResult<&[Arc]> { mod tests { use std::collections::HashSet; + use proptest::prelude::*; + use tiledb_pod::array::attribute::AttributeData; + use tiledb_pod::array::dimension::DimensionData; + use 
tiledb_pod::array::schema::SchemaData; + use super::*; - use crate::array::schema::{Field as SchemaField, SchemaData}; - use crate::array::{AttributeData, DimensionData}; + use crate::array::schema::Field as SchemaField; use crate::Factory; - use proptest::prelude::*; fn do_to_arrow(tdb_in: SchemaData) { let c: Context = Context::new().unwrap(); diff --git a/tiledb/api/src/array/schema/mod.rs b/tiledb/api/src/array/schema/mod.rs index 87a49962..1c25fd01 100644 --- a/tiledb/api/src/array/schema/mod.rs +++ b/tiledb/api/src/array/schema/mod.rs @@ -1,153 +1,26 @@ use std::borrow::Borrow; -use std::fmt::{Debug, Display, Formatter, Result as FmtResult}; -use std::num::{NonZeroU32, NonZeroUsize}; +use std::num::NonZeroUsize; use std::ops::Deref; +#[cfg(any(test, feature = "pod"))] +use std::fmt::{Debug, Formatter, Result as FmtResult}; + use anyhow::anyhow; -use serde::{Deserialize, Serialize}; -use serde_json::json; -use util::option::OptionSubset; -use crate::array::attribute::{AttributeData, RawAttribute}; -use crate::array::dimension::{Dimension, DimensionData}; -use crate::array::domain::{DomainData, RawDomain}; +use crate::array::attribute::RawAttribute; +use crate::array::dimension::Dimension; +use crate::array::domain::RawDomain; use crate::array::enumeration::Enumeration; use crate::array::{Attribute, CellOrder, Domain, TileOrder}; use crate::context::{CApiInterface, Context, ContextBound}; use crate::error::Error; -use crate::filter::list::{FilterList, FilterListData, RawFilterList}; +use crate::filter::list::{FilterList, RawFilterList}; use crate::key::LookupKey; use crate::query::read::output::FieldScratchAllocator; use crate::Datatype; -use crate::{Factory, Result as TileDBResult}; - -#[derive( - Clone, - Default, - Copy, - Debug, - Deserialize, - Eq, - OptionSubset, - PartialEq, - Serialize, -)] -pub enum ArrayType { - #[default] - Dense, - Sparse, -} - -impl ArrayType { - pub(crate) fn capi_enum(&self) -> ffi::tiledb_array_type_t { - match *self { - 
ArrayType::Dense => ffi::tiledb_array_type_t_TILEDB_DENSE, - ArrayType::Sparse => ffi::tiledb_array_type_t_TILEDB_SPARSE, - } - } -} - -impl TryFrom for ArrayType { - type Error = crate::error::Error; - fn try_from(value: ffi::tiledb_array_type_t) -> TileDBResult { - match value { - ffi::tiledb_array_type_t_TILEDB_DENSE => Ok(ArrayType::Dense), - ffi::tiledb_array_type_t_TILEDB_SPARSE => Ok(ArrayType::Sparse), - _ => Err(Self::Error::LibTileDB(format!( - "Invalid array type: {}", - value - ))), - } - } -} - -/// Represents the number of values carried within a single cell of an attribute or dimension. -#[derive( - Copy, Clone, Debug, Deserialize, Eq, OptionSubset, PartialEq, Serialize, -)] -pub enum CellValNum { - /// The number of values per cell is a specific fixed number. - Fixed(std::num::NonZeroU32), - /// The number of values per cell varies. - /// When this option is used for a dimension or attribute, queries must allocate additional - /// space to hold structural information about each cell. The values will be concatenated - /// together in a single buffer, and the structural data buffer contains the offset - /// of each record into the values buffer. - Var, -} - -impl CellValNum { - pub(crate) fn capi(&self) -> u32 { - match self { - CellValNum::Fixed(c) => c.get(), - CellValNum::Var => u32::MAX, - } - } +use crate::Result as TileDBResult; - pub fn single() -> Self { - CellValNum::Fixed(NonZeroU32::new(1).unwrap()) - } - - pub fn is_var_sized(&self) -> bool { - matches!(self, CellValNum::Var) - } - - pub fn is_single_valued(&self) -> bool { - matches!(self, CellValNum::Fixed(nz) if nz.get() == 1) - } - - /// Return the fixed number of values per cell, if not variable. 
- pub fn fixed(&self) -> Option { - if let CellValNum::Fixed(nz) = self { - Some(*nz) - } else { - None - } - } -} - -impl Default for CellValNum { - fn default() -> Self { - Self::single() - } -} - -impl Display for CellValNum { - fn fmt(&self, f: &mut Formatter) -> FmtResult { - ::fmt(self, f) - } -} - -impl PartialEq for CellValNum { - fn eq(&self, other: &u32) -> bool { - match self { - CellValNum::Fixed(val) => val.get() == *other, - CellValNum::Var => *other == u32::MAX, - } - } -} - -impl TryFrom for CellValNum { - type Error = crate::error::Error; - fn try_from(value: u32) -> TileDBResult { - match value { - 0 => Err(Error::InvalidArgument(anyhow!( - "Cell val num cannot be zero" - ))), - u32::MAX => Ok(CellValNum::Var), - v => Ok(CellValNum::Fixed(NonZeroU32::new(v).unwrap())), - } - } -} - -impl From for u32 { - fn from(value: CellValNum) -> Self { - match value { - CellValNum::Fixed(nz) => nz.get(), - CellValNum::Var => u32::MAX, - } - } -} +pub use tiledb_common::array::{ArrayType, CellValNum}; /// Wrapper for the CAPI handle. /// Ensures that the CAPI structure is freed. 
@@ -221,127 +94,37 @@ impl Field { &self, memory_limit: Option, ) -> TileDBResult { - Ok(FieldData::try_from(self)?.query_scratch_allocator(memory_limit)) - } -} - -impl From for Field { - fn from(dim: Dimension) -> Field { - Field::Dimension(dim) - } -} - -impl From for Field { - fn from(attr: Attribute) -> Field { - Field::Attribute(attr) - } -} - -#[derive(Clone, Debug, Deserialize, OptionSubset, Serialize, PartialEq)] -pub enum FieldData { - Dimension(DimensionData), - Attribute(AttributeData), -} - -impl FieldData { - pub fn is_attribute(&self) -> bool { - matches!(self, Self::Attribute(_)) - } - - pub fn is_dimension(&self) -> bool { - matches!(self, Self::Dimension(_)) - } - - pub fn name(&self) -> &str { - match self { - Self::Dimension(d) => &d.name, - Self::Attribute(a) => &a.name, - } - } - - pub fn datatype(&self) -> Datatype { - match self { - Self::Dimension(d) => d.datatype, - Self::Attribute(a) => a.datatype, - } - } - - pub fn cell_val_num(&self) -> Option { - match self { - Self::Dimension(d) => Some(d.cell_val_num()), - Self::Attribute(a) => a.cell_val_num, - } - } - - pub fn nullability(&self) -> Option { - match self { - Self::Dimension(_) => Some(false), - Self::Attribute(a) => a.nullability, - } - } - - pub fn query_scratch_allocator( - &self, - memory_limit: Option, - ) -> crate::query::read::output::FieldScratchAllocator { /* * Allocate space for the largest integral number of cells * which fits within the memory limit. */ - let est_values_per_cell = match self.cell_val_num().unwrap_or_default() - { + let est_values_per_cell = match self.cell_val_num()? 
{ CellValNum::Fixed(nz) => nz.get() as usize, CellValNum::Var => 64, }; - let est_cell_size = - est_values_per_cell * self.datatype().size() as usize; + let est_cell_size = est_values_per_cell * self.datatype()?.size(); let est_cell_capacity = memory_limit .unwrap_or(FieldScratchAllocator::DEFAULT_MEMORY_LIMIT) / est_cell_size; - FieldScratchAllocator { + Ok(FieldScratchAllocator { cell_val_num: self.cell_val_num().unwrap_or_default(), record_capacity: NonZeroUsize::new(est_cell_capacity).unwrap(), is_nullable: self.nullability().unwrap_or(true), - } - } -} - -impl Display for FieldData { - fn fmt(&self, f: &mut Formatter) -> FmtResult { - write!(f, "{}", json!(*self)) - } -} - -impl From for FieldData { - fn from(attr: AttributeData) -> Self { - FieldData::Attribute(attr) - } -} - -impl From for FieldData { - fn from(dim: DimensionData) -> Self { - FieldData::Dimension(dim) + }) } } -impl TryFrom<&Field> for FieldData { - type Error = crate::error::Error; - - fn try_from(field: &Field) -> TileDBResult { - match field { - Field::Dimension(d) => Ok(Self::from(DimensionData::try_from(d)?)), - Field::Attribute(a) => Ok(Self::from(AttributeData::try_from(a)?)), - } +impl From for Field { + fn from(dim: Dimension) -> Field { + Field::Dimension(dim) } } -impl TryFrom for FieldData { - type Error = crate::error::Error; - - fn try_from(field: Field) -> TileDBResult { - Self::try_from(&field) +impl From for Field { + fn from(attr: Attribute) -> Field { + Field::Attribute(attr) } } @@ -416,7 +199,7 @@ impl Schema { ffi::tiledb_array_schema_get_array_type(ctx, c_schema, &mut c_atype) })?; - ArrayType::try_from(c_atype) + Ok(ArrayType::try_from(c_atype)?) } /// Returns the sparse tile capacity for this schema, @@ -448,7 +231,7 @@ impl Schema { ) })?; - CellOrder::try_from(c_cell_order) + Ok(CellOrder::try_from(c_cell_order)?) 
} pub fn tile_order(&self) -> TileDBResult { @@ -462,7 +245,7 @@ impl Schema { ) })?; - TileOrder::try_from(c_tile_order) + Ok(TileOrder::try_from(c_tile_order)?) } /// Returns whether duplicate coordinates are permitted. @@ -597,17 +380,6 @@ impl Schema { } } -impl Debug for Schema { - fn fmt(&self, f: &mut Formatter) -> FmtResult { - let data = SchemaData::try_from(self).map_err(|_| std::fmt::Error)?; - let mut json = json!(data); - json["version"] = json!(self.version()); - json["raw"] = json!(format!("{:p}", *self.raw)); - - write!(f, "{}", json) - } -} - impl PartialEq for Schema { fn eq(&self, other: &Schema) -> bool { eq_helper!(self.num_attributes(), other.num_attributes()); @@ -631,6 +403,19 @@ impl PartialEq for Schema { } } +#[cfg(any(test, feature = "pod"))] +impl Debug for Schema { + fn fmt(&self, f: &mut Formatter) -> FmtResult { + match tiledb_pod::array::schema::SchemaData::try_from(self) { + Ok(s) => Debug::fmt(&s, f), + Err(e) => { + let RawSchema::Owned(ptr) = self.raw; + write!(f, "", ptr, e) + } + } + } +} + pub struct Fields<'a> { schema: &'a Schema, cursor: usize, @@ -694,7 +479,7 @@ impl Builder { array_type: ArrayType, domain: Domain, ) -> TileDBResult { - let c_array_type = array_type.capi_enum(); + let c_array_type = ffi::tiledb_array_type_t::from(array_type); let mut c_schema: *mut ffi::tiledb_array_schema_t = std::ptr::null_mut(); context.capi_call(|ctx| unsafe { @@ -730,7 +515,7 @@ impl Builder { pub fn cell_order(self, order: CellOrder) -> TileDBResult { let c_schema = *self.schema.raw; - let c_order = order.capi_enum(); + let c_order = ffi::tiledb_layout_t::from(order); self.capi_call(|ctx| unsafe { ffi::tiledb_array_schema_set_cell_order(ctx, c_schema, c_order) })?; @@ -739,7 +524,7 @@ impl Builder { pub fn tile_order(self, order: TileOrder) -> TileDBResult { let c_schema = *self.schema.raw; - let c_order = order.capi_enum(); + let c_order = ffi::tiledb_layout_t::from(order); self.capi_call(|ctx| unsafe { 
ffi::tiledb_array_schema_set_tile_order(ctx, c_schema, c_order) })?; @@ -846,196 +631,22 @@ impl TryFrom for Schema { } } -/// Encapsulation of data needed to construct a Schema -#[derive( - Clone, Default, Debug, Deserialize, OptionSubset, PartialEq, Serialize, -)] -pub struct SchemaData { - pub array_type: ArrayType, - pub domain: DomainData, - pub capacity: Option, - pub cell_order: Option, - pub tile_order: Option, - pub allow_duplicates: Option, - pub attributes: Vec, - pub coordinate_filters: FilterListData, - pub offsets_filters: FilterListData, - pub nullity_filters: FilterListData, -} - -impl SchemaData { - const DEFAULT_SPARSE_TILE_CAPACITY: u64 = 10000; - - pub fn num_fields(&self) -> usize { - self.domain.dimension.len() + self.attributes.len() - } - - pub fn field>(&self, key: K) -> Option { - match key.into() { - LookupKey::Index(idx) => { - if idx < self.domain.dimension.len() { - Some(FieldData::from(self.domain.dimension[idx].clone())) - } else if idx - < self.domain.dimension.len() + self.attributes.len() - { - Some(FieldData::from( - self.attributes[idx - self.domain.dimension.len()] - .clone(), - )) - } else { - None - } - } - LookupKey::Name(name) => { - for d in self.domain.dimension.iter() { - if d.name == name { - return Some(FieldData::from(d.clone())); - } - } - for a in self.attributes.iter() { - if a.name == name { - return Some(FieldData::from(a.clone())); - } - } - None - } - } - } - - pub fn fields(&self) -> FieldDataIter { - FieldDataIter::new(self) - } - - /// Returns the number of cells per tile - pub fn num_cells_per_tile(&self) -> usize { - match self.array_type { - ArrayType::Dense => { - // it should be safe to unwrap, the two `None` conditions must not - // be satisfied for a dense array domain - // (TODO: what about for string ascii dense domains?) 
- self.domain.num_cells_per_tile().unwrap() - } - ArrayType::Sparse => { - self.capacity.unwrap_or(Self::DEFAULT_SPARSE_TILE_CAPACITY) - as usize - } - } - } -} - -impl Display for SchemaData { - fn fmt(&self, f: &mut Formatter) -> FmtResult { - write!(f, "{}", json!(*self)) - } -} - -impl TryFrom<&Schema> for SchemaData { - type Error = crate::error::Error; - - fn try_from(schema: &Schema) -> TileDBResult { - Ok(SchemaData { - array_type: schema.array_type()?, - domain: DomainData::try_from(&schema.domain()?)?, - capacity: Some(schema.capacity()?), - cell_order: Some(schema.cell_order()?), - tile_order: Some(schema.tile_order()?), - allow_duplicates: Some(schema.allows_duplicates()?), - attributes: (0..schema.num_attributes()?) - .map(|a| AttributeData::try_from(&schema.attribute(a)?)) - .collect::>>()?, - coordinate_filters: FilterListData::try_from( - &schema.coordinate_filters()?, - )?, - offsets_filters: FilterListData::try_from( - &schema.offsets_filters()?, - )?, - nullity_filters: FilterListData::try_from( - &schema.nullity_filters()?, - )?, - }) - } -} - -impl TryFrom for SchemaData { - type Error = crate::error::Error; - - fn try_from(schema: Schema) -> TileDBResult { - Self::try_from(&schema) - } -} - -impl Factory for SchemaData { - type Item = Schema; - - fn create(&self, context: &Context) -> TileDBResult { - let mut b = self.attributes.iter().try_fold( - Builder::new( - context, - self.array_type, - self.domain.create(context)?, - )? - .coordinate_filters(self.coordinate_filters.create(context)?)? - .offsets_filters(self.offsets_filters.create(context)?)? 
- .nullity_filters(self.nullity_filters.create(context)?)?, - |b, a| b.add_attribute(a.create(context)?), - )?; - if let Some(c) = self.capacity { - b = b.capacity(c)?; - } - if let Some(d) = self.allow_duplicates { - b = b.allow_duplicates(d)?; - } - if let Some(o) = self.cell_order { - b = b.cell_order(o)?; - } - if let Some(o) = self.tile_order { - b = b.tile_order(o)?; - } - - b.build() - } -} - -pub struct FieldDataIter<'a> { - schema: &'a SchemaData, - cursor: usize, -} - -impl<'a> FieldDataIter<'a> { - pub fn new(schema: &'a SchemaData) -> Self { - FieldDataIter { schema, cursor: 0 } - } -} - -impl Iterator for FieldDataIter<'_> { - type Item = FieldData; - fn next(&mut self) -> Option { - if self.cursor < self.schema.num_fields() { - let item = self.schema.field(self.cursor); - self.cursor += 1; - Some(item.expect("Internal indexing error")) - } else { - None - } - } - - fn size_hint(&self) -> (usize, Option) { - let exact = self.schema.num_fields() - self.cursor; - (exact, Some(exact)) - } -} - -impl std::iter::FusedIterator for FieldDataIter<'_> {} - #[cfg(feature = "arrow")] pub mod arrow; -#[cfg(any(test, feature = "proptest-strategies"))] -pub mod strategy; +#[cfg(any(test, feature = "pod"))] +pub mod pod; #[cfg(test)] mod tests { - use tiledb_test_utils::{self, TestArrayUri}; + use proptest::prelude::*; + use tiledb_common::physical_type_go; + use tiledb_pod::array::attribute::AttributeData; + use tiledb_pod::array::dimension::DimensionData; + use tiledb_pod::array::domain::DomainData; + use tiledb_pod::array::schema::SchemaData; + use uri::{self, TestArrayUri}; + use utils::assert_option_subset; use super::*; use crate::array::tests::create_quickstart_dense; @@ -1045,6 +656,7 @@ mod tests { use crate::filter::{ CompressionData, CompressionType, FilterData, FilterListBuilder, }; + use crate::{Context, Factory}; fn sample_attribute(c: &Context) -> Attribute { AttributeBuilder::new(c, "a1", Datatype::Int32) @@ -1197,7 +809,7 @@ mod tests { #[test] fn 
test_load() -> TileDBResult<()> { - let test_uri = tiledb_test_utils::get_uri_generator() + let test_uri = uri::get_uri_generator() .map_err(|e| Error::Other(e.to_string()))?; let c: Context = Context::new().unwrap(); @@ -1674,14 +1286,37 @@ mod tests { } } + /// Test that the arbitrary schema construction always succeeds + #[test] + fn schema_arbitrary() { + let ctx = Context::new().expect("Error creating context"); + + proptest!(|(maybe_schema in any::())| { + maybe_schema.create(&ctx) + .expect("Error constructing arbitrary schema"); + }); + } + + #[test] + fn schema_eq_reflexivity() { + let ctx = Context::new().expect("Error creating context"); + + proptest!(|(schema in any::())| { + assert_eq!(schema, schema); + assert_option_subset!(schema, schema); + + let schema = schema.create(&ctx) + .expect("Error constructing arbitrary schema"); + assert_eq!(schema, schema); + }); + } + /// Test what the default values filled in for `None` with schema data are. /// Mostly because if we write code which does need the default, we're expecting /// to match core and need to be notified if something changes or we did something /// wrong. #[test] fn test_defaults() { - use crate::array::dimension::DimensionConstraints; - let ctx = Context::new().unwrap(); let dense_spec = SchemaData { @@ -1729,7 +1364,6 @@ mod tests { }], ..Default::default() }; - let sparse_schema = sparse_spec .create(&ctx) .expect("Error creating schema from mostly-default settings"); @@ -1749,32 +1383,33 @@ mod tests { /// Namely we assume that StringAscii is only allowed /// in variable-length sparse dimensions. 
#[test] - fn test_string_dimension() { - let mut spec = SchemaData { - array_type: ArrayType::Sparse, - domain: DomainData { - dimension: vec![DimensionData { - name: "d".to_string(), - datatype: Datatype::StringAscii, - constraints: DimensionConstraints::StringAscii, - filters: None, - }], - }, - attributes: vec![AttributeData { - name: "a".to_string(), - datatype: Datatype::Int32, - ..Default::default() - }], - ..Default::default() - }; - - assert_eq!(CellValNum::Var, spec.domain.dimension[0].cell_val_num()); - + fn test_string_dimension() -> TileDBResult<()> { let ctx = Context::new().unwrap(); + let build_schema = |array_type: ArrayType| { + Builder::new( + &ctx, + array_type, + DomainBuilder::new(&ctx)? + .add_dimension( + DimensionBuilder::new( + &ctx, + "d", + Datatype::StringAscii, + DimensionConstraints::StringAscii, + )? + .build(), + )? + .build(), + )? + .add_attribute(sample_attribute(&ctx))? + .build() + }; + // creation should succeed, StringAscii is allowed for sparse CellValNum::Var { - let schema = spec.create(&ctx).expect("Error creating schema"); + let schema = + build_schema(ArrayType::Sparse).expect("Error creating schema"); let cvn = schema .domain() .and_then(|d| d.dimension(0)) @@ -1784,10 +1419,110 @@ mod tests { } // creation should fail, StringAscii is not allowed for dense CellValNum::single() - spec.array_type = ArrayType::Dense; { - let e = spec.create(&ctx).expect_err("Successfully created schema"); - assert!(matches!(e, Error::LibTileDB(_))); + let e = build_schema(ArrayType::Dense); + assert!(matches!(e, Err(Error::LibTileDB(_)))); + } + + Ok(()) + } + + /// Creates a schema with a single dimension of the given `Datatype` with one attribute. + /// Used by the test to check if the `Datatype` can be used in this way. 
+ fn dimension_comprehensive_schema( + context: &Context, + array_type: ArrayType, + datatype: Datatype, + ) -> TileDBResult { + let dim = physical_type_go!(datatype, DT, { + if matches!(datatype, Datatype::StringAscii) { + DimensionBuilder::new( + context, + "d", + datatype, + DimensionConstraints::StringAscii, + ) + } else { + let domain: [DT; 2] = [0 as DT, 127 as DT]; + let extent: DT = 16 as DT; + DimensionBuilder::new(context, "d", datatype, (domain, extent)) + } + })? + .build(); + + let attr = AttributeBuilder::new(context, "a", Datatype::Any)?.build(); + + let domain = DomainBuilder::new(context)?.add_dimension(dim)?.build(); + Builder::new(context, array_type, domain)? + .add_attribute(attr)? + .build() + } + + fn do_dense_dimension_comprehensive(datatype: Datatype) { + let allowed = tiledb_common::datatype::DENSE_DIMENSION_DATATYPES + .contains(&datatype); + assert_eq!(allowed, datatype.is_allowed_dimension_type_dense()); + + let context = Context::new().unwrap(); + let r = dimension_comprehensive_schema( + &context, + ArrayType::Dense, + datatype, + ); + assert_eq!(allowed, r.is_ok(), "try_construct => {:?}", r.err()); + if let Err(Error::LibTileDB(s)) = r { + assert!( + s.contains("not a valid Dimension Datatype") + || s.contains("do not support dimension datatype"), + "Expected dimension datatype error, received: {}", + s + ); + } else { + assert!( + r.is_ok(), + "Found error other than LibTileDB: {}", + r.err().unwrap() + ); + } + } + + fn do_sparse_dimension_comprehensive(datatype: Datatype) { + let allowed = tiledb_common::datatype::SPARSE_DIMENSION_DATATYPES + .contains(&datatype); + assert_eq!(allowed, datatype.is_allowed_dimension_type_sparse()); + + let context = Context::new().unwrap(); + let r = dimension_comprehensive_schema( + &context, + ArrayType::Sparse, + datatype, + ); + assert_eq!(allowed, r.is_ok(), "try_construct => {:?}", r.err()); + if let Err(Error::LibTileDB(s)) = r { + assert!( + s.contains("not a valid Dimension Datatype") + 
|| s.contains("do not support dimension datatype"), + "Expected dimension datatype error, received: {}", + s + ); + } else { + assert!( + r.is_ok(), + "Found error other than LibTileDB: {}", + r.err().unwrap() + ); + } + } + + proptest! { + #[test] + fn dense_dimension_comprehensive(dt in any::()) { + do_dense_dimension_comprehensive(dt) + } + + #[test] + fn sparse_dimension_comprehensive(dt in any::()) { + do_sparse_dimension_comprehensive(dt) } } } diff --git a/tiledb/api/src/array/schema/pod.rs b/tiledb/api/src/array/schema/pod.rs new file mode 100644 index 00000000..7c75ec96 --- /dev/null +++ b/tiledb/api/src/array/schema/pod.rs @@ -0,0 +1,95 @@ +use tiledb_common::filter::FilterData; +use tiledb_pod::array::attribute::AttributeData; +use tiledb_pod::array::dimension::DimensionData; +use tiledb_pod::array::domain::DomainData; +use tiledb_pod::array::schema::{FieldData, SchemaData}; + +use super::{Builder, Field, Schema}; +use crate::error::Error as TileDBError; +use crate::{Context, Factory, Result as TileDBResult}; + +impl TryFrom<&Schema> for SchemaData { + type Error = TileDBError; + + fn try_from(schema: &Schema) -> Result { + Ok(SchemaData { + array_type: schema.array_type()?, + domain: DomainData::try_from(&schema.domain()?)?, + capacity: Some(schema.capacity()?), + cell_order: Some(schema.cell_order()?), + tile_order: Some(schema.tile_order()?), + allow_duplicates: Some(schema.allows_duplicates()?), + attributes: (0..schema.num_attributes()?) 
+ .map(|a| AttributeData::try_from(&schema.attribute(a)?)) + .collect::>>()?, + coordinate_filters: Vec::::try_from( + &schema.coordinate_filters()?, + )?, + offsets_filters: Vec::::try_from( + &schema.offsets_filters()?, + )?, + nullity_filters: Vec::::try_from( + &schema.nullity_filters()?, + )?, + }) + } +} + +impl TryFrom for SchemaData { + type Error = TileDBError; + + fn try_from(schema: Schema) -> Result { + Self::try_from(&schema) + } +} + +impl Factory for SchemaData { + type Item = Schema; + + fn create(&self, context: &Context) -> TileDBResult { + let mut b = self.attributes.iter().try_fold( + Builder::new( + context, + self.array_type, + self.domain.create(context)?, + )? + .coordinate_filters(self.coordinate_filters.create(context)?)? + .offsets_filters(self.offsets_filters.create(context)?)? + .nullity_filters(self.nullity_filters.create(context)?)?, + |b, a| b.add_attribute(a.create(context)?), + )?; + if let Some(c) = self.capacity { + b = b.capacity(c)?; + } + if let Some(d) = self.allow_duplicates { + b = b.allow_duplicates(d)?; + } + if let Some(o) = self.cell_order { + b = b.cell_order(o)?; + } + if let Some(o) = self.tile_order { + b = b.tile_order(o)?; + } + + b.build() + } +} + +impl TryFrom<&Field> for FieldData { + type Error = TileDBError; + + fn try_from(field: &Field) -> Result { + match field { + Field::Dimension(d) => Ok(Self::from(DimensionData::try_from(d)?)), + Field::Attribute(a) => Ok(Self::from(AttributeData::try_from(a)?)), + } + } +} + +impl TryFrom for FieldData { + type Error = TileDBError; + + fn try_from(field: Field) -> Result { + Self::try_from(&field) + } +} diff --git a/tiledb/api/src/array/strategy.rs b/tiledb/api/src/array/strategy.rs deleted file mode 100644 index 327b5356..00000000 --- a/tiledb/api/src/array/strategy.rs +++ /dev/null @@ -1,49 +0,0 @@ -use proptest::prelude::*; - -use crate::array::TileOrder; - -impl Arbitrary for TileOrder { - type Parameters = (); - type Strategy = BoxedStrategy; - - fn 
arbitrary_with(_: Self::Parameters) -> Self::Strategy { - prop_oneof![Just(TileOrder::RowMajor), Just(TileOrder::ColumnMajor)] - .boxed() - } -} - -#[cfg(test)] -mod tests { - use util::assert_option_subset; - use util::option::OptionSubset; - - use tiledb_test_utils::{self, TestArrayUri}; - - use super::*; - use crate::array::schema::SchemaData; - use crate::array::{Array, Schema}; - use crate::context::Context; - use crate::error::Error; - use crate::Factory; - - #[test] - fn test_array_create() { - let ctx = Context::new().expect("Error creating context"); - - proptest!(|(schema_spec in any::())| { - let schema_in = schema_spec.create(&ctx) - .expect("Error constructing arbitrary schema"); - - let test_uri = tiledb_test_utils::get_uri_generator().map_err(|e| Error::Other(e.to_string()))?; - let uri = test_uri.with_path("array").map_err(|e| Error::Other(e.to_string()))?; - - Array::create(&ctx, &uri, schema_in) - .expect("Error creating array"); - - let schema_out = Schema::load(&ctx, &uri).expect("Error loading array schema"); - - let schema_out_spec = SchemaData::try_from(&schema_out).expect("Error creating schema spec"); - assert_option_subset!(schema_spec, schema_out_spec); - }) - } -} diff --git a/tiledb/api/src/datatype/arrow.rs b/tiledb/api/src/datatype/arrow.rs index 34dca620..e409e1a2 100644 --- a/tiledb/api/src/datatype/arrow.rs +++ b/tiledb/api/src/datatype/arrow.rs @@ -1,15 +1,6 @@ -use std::collections::HashMap; -use std::num::NonZeroU32; -use std::sync::Arc; +use arrow::datatypes::{ArrowNativeType, ArrowPrimitiveType}; -use arrow::datatypes::{ - ArrowNativeTypeOp, ArrowPrimitiveType, Field, TimeUnit, -}; - -use crate::array::CellValNum; -use crate::Datatype; - -pub trait ArrowPrimitiveTypeNative: ArrowNativeTypeOp { +pub trait ArrowPrimitiveTypeNative: ArrowNativeType { type ArrowPrimitiveType: ArrowPrimitiveType; } @@ -52,1033 +43,3 @@ impl ArrowPrimitiveTypeNative for f32 { impl ArrowPrimitiveTypeNative for f64 { type ArrowPrimitiveType = 
arrow::datatypes::Float64Type; } - -/// Represents tiledb (`Datatype`, `CellValNum`) compatibility for an arrow `DataType`. -#[derive(Clone, Debug, Eq, PartialEq)] -pub enum DatatypeToArrowResult { - /// There is an exact logical match for the tiledb `Datatype`. - /// The individual values of the respective types have the same bit width - /// and are meant to be interpreted the same way. - /// - /// In general, this means that: - /// 1. `CellValNum::Fixed(1)` maps to an arrow primitive or date/time type. - /// ``` - /// use tiledb::{array::CellValNum, datatype::arrow::DatatypeToArrowResult}; - /// assert_eq!(DatatypeToArrowResult::Exact(arrow::datatypes::DataType::UInt8), - /// tiledb::datatype::arrow::to_arrow(&tiledb::Datatype::UInt8, CellValNum::single())); - /// ``` - /// 2. `CellValNum::Fixed(n) if n > 1` 1 maps to an arrow fixed size list. - /// ``` - /// use arrow::datatypes::DataType as Arrow; - /// use tiledb::{Datatype as TileDB, array::CellValNum, datatype::arrow::DatatypeToArrowResult}; - /// let arrow = tiledb::datatype::arrow::to_arrow(&TileDB::UInt8, CellValNum::try_from(8).unwrap()); - /// let DatatypeToArrowResult::Exact(Arrow::FixedSizeList(item, fixed_len)) = arrow else { unreachable!() }; - /// assert_eq!(*item.data_type(), Arrow::UInt8); - /// assert_eq!(fixed_len, 8); - /// ``` - /// 3. `CellValNum::Var` maps to an arrow `LargeList`. - /// ``` - /// use arrow::datatypes::DataType as Arrow; - /// use tiledb::{Datatype as TileDB, array::CellValNum, datatype::arrow::DatatypeToArrowResult}; - /// let arrow = tiledb::datatype::arrow::to_arrow(&TileDB::UInt8, CellValNum::Var); - /// let DatatypeToArrowResult::Exact(Arrow::LargeList(item)) = arrow else { unreachable!() }; - /// assert_eq!(*item.data_type(), Arrow::UInt8); - /// ``` - /// - /// There are some exceptions, such as `(Datatype::Blob, CellValNum::Var)` - /// mapping to `arrow::datatypes::DataType::LargeBinary`, which is always variable-length. 
- /// - /// ``` - /// use tiledb::{array::CellValNum, datatype::arrow::DatatypeToArrowResult}; - /// assert_eq!(DatatypeToArrowResult::Exact(arrow::datatypes::DataType::LargeBinary), - /// tiledb::datatype::arrow::to_arrow(&tiledb::Datatype::Blob, CellValNum::Var)); - /// ``` - /// When the output is any kind of list, field metadata may be used to represent the exact - /// input datatype if the input on its own is an inexact match. - /// ``` - /// use arrow::datatypes::DataType as Arrow; - /// use tiledb::{Datatype as TileDB, array::CellValNum, datatype::arrow::DatatypeToArrowResult}; - /// use tiledb::datatype::arrow::{to_arrow, ARROW_FIELD_METADATA_KEY_TILEDB_TYPE_HINT}; - /// let arrow = to_arrow(&TileDB::StringAscii, CellValNum::Var); - /// let DatatypeToArrowResult::Exact(Arrow::LargeList(item)) = arrow else { unreachable!() }; - /// assert_eq!(*item.data_type(), Arrow::UInt8); - /// let Some(s) = item.metadata().get(ARROW_FIELD_METADATA_KEY_TILEDB_TYPE_HINT) - /// else { unreachable!() }; - /// assert_eq!(Some(TileDB::StringAscii), TileDB::from_string(s)); - /// ``` - Exact(arrow::datatypes::DataType), - /// There is no corresponding logical data type, but a physical data type - /// with the same bit width can be used to represent primitive values, - /// and there is a trivial or cheap conversion between value structural data. 
- /// ``` - /// use tiledb::{array::CellValNum, datatype::arrow::DatatypeToArrowResult}; - /// assert_eq!(DatatypeToArrowResult::Inexact(arrow::datatypes::DataType::UInt8), - /// tiledb::datatype::arrow::to_arrow(&tiledb::Datatype::StringAscii, CellValNum::single())); - /// ``` - Inexact(arrow::datatypes::DataType), -} - -impl DatatypeToArrowResult { - pub fn is_inexact(&self) -> bool { - matches!(self, Self::Inexact(_)) - } - - pub fn is_exact(&self) -> bool { - matches!(self, Self::Exact(_)) - } - - pub fn into_inner(self) -> arrow::datatypes::DataType { - match self { - Self::Exact(arrow) => arrow, - Self::Inexact(arrow) => arrow, - } - } -} - -/* - * (Datatype::StringAscii, CellValNum::Var) does not have an exact analog in Arrow. - * Utf8 sounds pretty good, but we can't use it because Arrow validates Utf8 and - * tiledb does not. So we use `LargeList(UInt8)` instead. - * However, in tiledb StringAscii has several special accommodations which - * are not granted to UInt8. We must be able to invert back to StringAscii. - * We can do that by storing the exact input datatype on the arrow list field metadata. - */ -/// `arrow::datatypes::Field` metadata key for the original `tiledb::Datatype` variant -/// if there is no exact mapping from `tiledb::Datatype` to `arrow::datatypes::DataType`. 
-pub const ARROW_FIELD_METADATA_KEY_TILEDB_TYPE_HINT: &str = "tiledb_type_hint"; - -pub fn to_arrow( - datatype: &Datatype, - cell_val_num: CellValNum, -) -> DatatypeToArrowResult { - use arrow::datatypes::DataType as ADT; - - type Res = DatatypeToArrowResult; - - match cell_val_num { - CellValNum::Fixed(nz) if nz.get() == 1 => { - match datatype { - Datatype::Int8 => Res::Exact(ADT::Int8), - Datatype::Int16 => Res::Exact(ADT::Int16), - Datatype::Int32 => Res::Exact(ADT::Int32), - Datatype::Int64 => Res::Exact(ADT::Int64), - Datatype::UInt8 => Res::Exact(ADT::UInt8), - Datatype::UInt16 => Res::Exact(ADT::UInt16), - Datatype::UInt32 => Res::Exact(ADT::UInt32), - Datatype::UInt64 => Res::Exact(ADT::UInt64), - Datatype::Float32 => Res::Exact(ADT::Float32), - Datatype::Float64 => Res::Exact(ADT::Float64), - Datatype::DateTimeSecond => { - Res::Exact(ADT::Timestamp(TimeUnit::Second, None)) - } - Datatype::DateTimeMillisecond => { - Res::Exact(ADT::Timestamp(TimeUnit::Millisecond, None)) - } - Datatype::DateTimeMicrosecond => { - Res::Exact(ADT::Timestamp(TimeUnit::Microsecond, None)) - } - Datatype::DateTimeNanosecond => { - Res::Exact(ADT::Timestamp(TimeUnit::Nanosecond, None)) - } - Datatype::TimeMicrosecond => { - Res::Exact(ADT::Time64(TimeUnit::Microsecond)) - } - Datatype::TimeNanosecond => { - Res::Exact(ADT::Time64(TimeUnit::Nanosecond)) - } - Datatype::Char => Res::Inexact(ADT::Int8), - Datatype::StringAscii => Res::Inexact(ADT::UInt8), - Datatype::StringUtf8 => Res::Inexact(ADT::UInt8), - Datatype::StringUtf16 => Res::Inexact(ADT::UInt16), - Datatype::StringUtf32 => Res::Inexact(ADT::UInt32), - Datatype::StringUcs2 => Res::Inexact(ADT::UInt16), - Datatype::StringUcs4 => Res::Inexact(ADT::UInt32), - Datatype::DateTimeDay - | Datatype::DateTimeYear - | Datatype::DateTimeMonth - | Datatype::DateTimeWeek - | Datatype::DateTimeHour - | Datatype::DateTimeMinute - | Datatype::DateTimePicosecond - | Datatype::DateTimeFemtosecond - | Datatype::DateTimeAttosecond - | 
Datatype::TimeHour - | Datatype::TimeMinute - | Datatype::TimeSecond - | Datatype::TimeMillisecond - | Datatype::TimePicosecond - | Datatype::TimeFemtosecond - | Datatype::TimeAttosecond => { - // these are signed 64-bit integers in tiledb, - // arrow datetypes with the same precision are 32 bits - // (or there is no equivalent time unit) - Res::Inexact(ADT::Int64) - } - Datatype::Blob - | Datatype::Boolean - | Datatype::GeometryWkb - | Datatype::GeometryWkt => Res::Inexact(ADT::UInt8), - Datatype::Any => { - // note that this likely is unreachable if the tiledb API is used - // correctly, as `Datatype::Any` requires `CellValNum::Var` - Res::Inexact(ADT::UInt8) - } - } - } - CellValNum::Fixed(nz) => match i32::try_from(nz.get()) { - Ok(nz) => { - if matches!(datatype, Datatype::Blob) { - Res::Exact(ADT::FixedSizeBinary(nz)) - } else { - match to_arrow(datatype, CellValNum::single()) { - Res::Exact(item) => Res::Exact(ADT::FixedSizeList( - Arc::new(arrow::datatypes::Field::new_list_field( - item, false, - )), - nz, - )), - Res::Inexact(item) => { - let metadata = HashMap::from_iter([( - ARROW_FIELD_METADATA_KEY_TILEDB_TYPE_HINT - .to_string(), - datatype.to_string(), - )]); - - let item = Arc::new( - Field::new_list_field(item, false) - .with_metadata(metadata), - ); - Res::Exact(ADT::FixedSizeList(item, nz)) - } - } - } - } - Err(_) => unimplemented!(), - }, - CellValNum::Var => { - if let Datatype::Blob = datatype { - Res::Exact(ADT::LargeBinary) - } else { - /* - * TODO: - * We could, and probably ought to, treat Utf8 in a similar fashion - * to LargeBinary as above. However, arrow (in contrast to tiledb) - * actually does to a UTF-8 integrity check. Until tiledb also - * does that, and we update our test strategies to generate - * valid UTF-8 sequences, we cannot do so. 
- */ - match to_arrow(datatype, CellValNum::single()) { - Res::Exact(item) => { - let item = Arc::new(Field::new_list_field(item, false)); - Res::Exact(ADT::LargeList(item)) - } - Res::Inexact(item) => { - let metadata = HashMap::from_iter([( - ARROW_FIELD_METADATA_KEY_TILEDB_TYPE_HINT - .to_string(), - datatype.to_string(), - )]); - let item = Arc::new( - Field::new_list_field(item, false) - .with_metadata(metadata), - ); - Res::Exact(ADT::LargeList(item)) - } - } - } - } - } -} - -/// Represents arrow type compatibility for a tiledb `Datatype` paired with a `CellValNum`. -#[derive(Copy, Clone, Debug, Eq, PartialEq)] -pub enum DatatypeFromArrowResult { - /// There is no reasonable matching type in tiledb. - /// This includes, but is not limited to, - /// types with 32-bit offsets; complex data types; view types; decimal types; and the null type. - None, - /// There is an exact logical match for the arrow `DataType`. - /// The individual values of the respective types have the same bit width - /// and are meant to be interpreted the same way. - /// ``` - /// use arrow::datatypes::DataType as Arrow; - /// use tiledb::{Datatype as TileDB, array::CellValNum}; - /// use tiledb::datatype::arrow::{from_arrow, DatatypeFromArrowResult}; - /// let tiledb = from_arrow(&Arrow::new_large_list(Arrow::Date32, false)); - /// assert_eq!(DatatypeFromArrowResult::Inexact(TileDB::Int32, CellValNum::Var), tiledb); - /// ``` - Exact(Datatype, CellValNum), - /// There is no corresponding logical data type, but a physical data type - /// with the same bit width can be used to represent primitive values, - /// and there is a trivial or cheap conversion between value structural data. 
- /// ``` - /// use arrow::datatypes::DataType as Arrow; - /// use tiledb::{Datatype as TileDB, array::CellValNum}; - /// use tiledb::datatype::arrow::{from_arrow, DatatypeFromArrowResult}; - /// let tiledb = from_arrow(&Arrow::Date32); - /// assert_eq!(DatatypeFromArrowResult::Inexact(TileDB::Int32, CellValNum::single()), tiledb); - /// ``` - Inexact(Datatype, CellValNum), -} - -impl DatatypeFromArrowResult { - pub fn is_inexact(&self) -> bool { - matches!(self, Self::Inexact(_, _)) - } - - pub fn is_exact(&self) -> bool { - matches!(self, Self::Exact(_, _)) - } - - pub fn ok(self) -> Option<(Datatype, CellValNum)> { - match self { - Self::None => None, - Self::Exact(dt, cv) => Some((dt, cv)), - Self::Inexact(dt, cv) => Some((dt, cv)), - } - } -} - -pub fn from_arrow( - value: &arrow::datatypes::DataType, -) -> DatatypeFromArrowResult { - use arrow::datatypes::DataType as ADT; - - type Res = DatatypeFromArrowResult; - - match value { - ADT::Null => Res::None, - ADT::Int8 => Res::Exact(Datatype::Int8, CellValNum::single()), - ADT::Int16 => Res::Exact(Datatype::Int16, CellValNum::single()), - ADT::Int32 => Res::Exact(Datatype::Int32, CellValNum::single()), - ADT::Int64 => Res::Exact(Datatype::Int64, CellValNum::single()), - ADT::UInt8 => Res::Exact(Datatype::UInt8, CellValNum::single()), - ADT::UInt16 => Res::Exact(Datatype::UInt16, CellValNum::single()), - ADT::UInt32 => Res::Exact(Datatype::UInt32, CellValNum::single()), - ADT::UInt64 => Res::Exact(Datatype::UInt64, CellValNum::single()), - ADT::Float16 => { - /* tiledb has no f16 type, so use u16 as a 2-byte container */ - Res::Inexact(Datatype::UInt16, CellValNum::single()) - } - ADT::Float32 => Res::Exact(Datatype::Float32, CellValNum::single()), - ADT::Float64 => Res::Exact(Datatype::Float64, CellValNum::single()), - ADT::Decimal128(_, _) | ADT::Decimal256(_, _) => { - /* - * We could map this to fixed-length blob but probably - * better to do a proper 128 or 256 bit thing in core - * so we avoid making 
mistakes here - */ - Res::None - } - ADT::Timestamp(TimeUnit::Second, _) => { - Res::Exact(Datatype::DateTimeSecond, CellValNum::single()) - } - ADT::Timestamp(TimeUnit::Millisecond, _) => { - Res::Exact(Datatype::DateTimeMillisecond, CellValNum::single()) - } - ADT::Timestamp(TimeUnit::Microsecond, _) => { - Res::Exact(Datatype::DateTimeMicrosecond, CellValNum::single()) - } - ADT::Timestamp(TimeUnit::Nanosecond, _) => { - Res::Exact(Datatype::DateTimeNanosecond, CellValNum::single()) - } - ADT::Date32 | ADT::Time32(_) => { - Res::Inexact(Datatype::Int32, CellValNum::single()) - } - ADT::Date64 => { - Res::Inexact(Datatype::DateTimeMillisecond, CellValNum::single()) - } - ADT::Time64(TimeUnit::Microsecond) => { - Res::Exact(Datatype::TimeMicrosecond, CellValNum::single()) - } - ADT::Time64(TimeUnit::Nanosecond) => { - Res::Exact(Datatype::TimeNanosecond, CellValNum::single()) - } - ADT::Time64(_) => Res::Inexact(Datatype::UInt64, CellValNum::single()), - ADT::Boolean => { - /* this may be bit-packed by arrow but is not by tiledb */ - Res::None - } - ADT::Duration(_) | ADT::Interval(_) => { - /* these are scalars but the doc does not specify bit width */ - Res::None - } - ADT::LargeBinary => Res::Exact(Datatype::Blob, CellValNum::Var), - ADT::FixedSizeBinary(len) => match u32::try_from(*len) { - Ok(len) => match NonZeroU32::new(len) { - None => Res::None, - Some(nz) => Res::Exact(Datatype::Blob, CellValNum::Fixed(nz)), - }, - Err(_) => Res::None, - }, - ADT::FixedSizeList(ref item, ref len) => { - let len = match u32::try_from(*len).ok().and_then(NonZeroU32::new) { - Some(len) => len, - None => return Res::None, - }; - if item.is_nullable() { - // tiledb validity applies to the entire cell, not the values within the cell. 
- // there is currently no way to represent null values within a cell - Res::None - } else if item.data_type().primitive_width().is_none() { - /* - * probably there are some cases we can handle, - * but let's omit for now - */ - Res::None - } else if let Some(exact_datatype) = item - .metadata() - .get(ARROW_FIELD_METADATA_KEY_TILEDB_TYPE_HINT) - .and_then(|s| Datatype::from_string(s)) - { - Res::Exact(exact_datatype, CellValNum::Fixed(len)) - } else { - match from_arrow(item.data_type()) { - Res::None => Res::None, - Res::Inexact(item, item_cell_val) => { - let cell_val_num = match item_cell_val { - CellValNum::Fixed(nz) => { - match nz.checked_mul(len) { - None => return Res::None, - Some(nz) => CellValNum::Fixed(nz), - } - } - CellValNum::Var => CellValNum::Var, - }; - Res::Inexact(item, cell_val_num) - } - Res::Exact(item, item_cell_val) => { - let cell_val_num = match item_cell_val { - CellValNum::Fixed(nz) => { - match nz.checked_mul(len) { - None => return Res::None, - Some(nz) => CellValNum::Fixed(nz), - } - } - CellValNum::Var => CellValNum::Var, - }; - Res::Exact(item, cell_val_num) - } - } - } - } - ADT::LargeUtf8 => { - /* - * NB: arrow checks for valid UTF-8 but tiledb does not. - * This is not an exact conversion for that reason - * because we cannot guarantee invertibility. - */ - Res::Inexact(Datatype::StringUtf8, CellValNum::Var) - } - ADT::LargeList(ref item) => { - if item.is_nullable() { - // tiledb validity applies to the entire cell, not the values within the cell. 
- // there is currently no way to represent null values within a cell - Res::None - } else if item.data_type().primitive_width().is_none() { - /* - * probably there are some cases we can handle, - * but let's omit for now - */ - Res::None - } else if let Some(exact_datatype) = item - .metadata() - .get(ARROW_FIELD_METADATA_KEY_TILEDB_TYPE_HINT) - .and_then(|s| Datatype::from_string(s)) - { - Res::Exact(exact_datatype, CellValNum::Var) - } else { - match from_arrow(item.data_type()) { - Res::None => Res::None, - Res::Inexact(item, CellValNum::Fixed(nz)) - if nz.get() == 1 => - { - Res::Inexact(item, CellValNum::Var) - } - Res::Exact(item, CellValNum::Fixed(nz)) - if nz.get() == 1 => - { - Res::Exact(item, CellValNum::Var) - } - _ => { - /* - * We probably *can* fill in more cases, but either: - * 1) we need to do work to keep the fixed cell val num around, doable but - * why bother right now - * 2) we need to keep multiple levels of offsets, not supported right now - */ - Res::None - } - } - } - } - ADT::Binary | ADT::Utf8 | ADT::List(_) => { - /* offsets are 64 bits, these types use 32-bit offsets */ - Res::None - } - ADT::BinaryView - | ADT::Utf8View - | ADT::ListView(_) - | ADT::LargeListView(_) => { - /* data does not arrive from tiledb core in this format */ - Res::None - } - ADT::Struct(_) - | ADT::Union(_, _) - | ADT::Dictionary(_, _) - | ADT::Map(_, _) - | ADT::RunEndEncoded(_, _) => { - /* complex types are not implemented */ - Res::None - } - } -} - -#[cfg(any(test, feature = "proptest-strategies"))] -pub mod strategy { - use std::sync::Arc; - - use proptest::prelude::*; - - #[derive(Clone, Debug)] - pub struct FieldParameters { - pub min_fixed_binary_len: i32, - pub max_fixed_binary_len: i32, - pub min_numeric_precision: u8, - pub max_numeric_precision: u8, - pub min_fixed_list_len: i32, - pub max_fixed_list_len: i32, - pub min_struct_fields: usize, - pub max_struct_fields: usize, - pub min_recursion_depth: u32, - pub max_recursion_depth: u32, - } - - 
impl Default for FieldParameters { - fn default() -> Self { - const DEFAULT_MAX_FIXED_BINARY_LEN: i32 = 1024 * 1024; - const DEFAULT_MAX_FIXED_LIST_LEN: i32 = 2048; - - FieldParameters { - min_fixed_binary_len: 1, - max_fixed_binary_len: DEFAULT_MAX_FIXED_BINARY_LEN, - min_numeric_precision: 1, - max_numeric_precision: u8::MAX, - min_fixed_list_len: 0, - max_fixed_list_len: DEFAULT_MAX_FIXED_LIST_LEN, - min_struct_fields: 0, - max_struct_fields: 16, - min_recursion_depth: 0, - max_recursion_depth: 8, - } - } - } - - pub fn any_datatype( - params: FieldParameters, - ) -> impl Strategy { - use arrow::datatypes::{ - DataType as ADT, Field, Fields, IntervalUnit, TimeUnit, - }; - - let leaf = prop_oneof![ - Just(ADT::Null), - Just(ADT::Int8), - Just(ADT::Int16), - Just(ADT::Int32), - Just(ADT::Int64), - Just(ADT::UInt8), - Just(ADT::UInt16), - Just(ADT::UInt32), - Just(ADT::UInt64), - Just(ADT::Float16), - Just(ADT::Float32), - Just(ADT::Float64), - Just(ADT::Timestamp(TimeUnit::Second, None)), - Just(ADT::Timestamp(TimeUnit::Millisecond, None)), - Just(ADT::Timestamp(TimeUnit::Microsecond, None)), - Just(ADT::Timestamp(TimeUnit::Nanosecond, None)), - Just(ADT::Date32), - Just(ADT::Date64), - Just(ADT::Time32(TimeUnit::Second)), - Just(ADT::Time32(TimeUnit::Millisecond)), - Just(ADT::Time64(TimeUnit::Microsecond)), - Just(ADT::Time64(TimeUnit::Nanosecond)), - Just(ADT::Duration(TimeUnit::Second)), - Just(ADT::Duration(TimeUnit::Millisecond)), - Just(ADT::Duration(TimeUnit::Nanosecond)), - Just(ADT::Interval(IntervalUnit::YearMonth)), - Just(ADT::Interval(IntervalUnit::DayTime)), - Just(ADT::Interval(IntervalUnit::MonthDayNano)), - Just(ADT::Binary), - (params.min_fixed_binary_len..=params.max_fixed_binary_len) - .prop_map(ADT::FixedSizeBinary), - Just(ADT::LargeBinary), - Just(ADT::Utf8), - Just(ADT::LargeUtf8), - (params.min_numeric_precision..=params.max_numeric_precision) - .prop_flat_map(|precision| ( - Just(precision), - (0..precision.clamp(0, i8::MAX as u8) as i8) 
- ) - .prop_map(|(precision, scale)| ADT::Decimal128( - precision, scale - ))), - (params.min_numeric_precision..=params.max_numeric_precision) - .prop_flat_map(|precision| ( - Just(precision), - (0..precision.clamp(0, i8::MAX as u8) as i8) - ) - .prop_map(|(precision, scale)| ADT::Decimal256( - precision, scale - ))), - ]; - - leaf.prop_recursive( - params.max_recursion_depth, - params.max_recursion_depth * 4, - std::cmp::max( - 2, - (params.max_struct_fields / 4).try_into().unwrap(), - ), - move |strategy| { - prop_oneof![ - (strategy.clone(), any::()) - .prop_map(|(s, b)| ADT::new_list(s, b)), - ( - strategy.clone(), - params.min_fixed_list_len..=params.max_fixed_list_len, - any::() - ) - .prop_map(|(s, l, b)| ADT::FixedSizeList( - Arc::new(Field::new_list_field(s, b)), - l - )), - (strategy.clone(), any::()).prop_map(|(s, b)| { - ADT::LargeList(Arc::new(Field::new_list_field(s, b))) - }), - proptest::collection::vec( - ( - crate::array::attribute::strategy::prop_attribute_name( - ), - strategy.clone(), - any::() - ), - params.min_struct_fields..=params.max_struct_fields - ) - .prop_map(|v| ADT::Struct( - v.into_iter() - .map(|(n, dt, b)| Field::new(n, dt, b)) - .collect::() - )) // union goes here - // dictionary goes here - // map goes here - // run-end encoded goes here - ] - }, - ) - } -} - -#[cfg(test)] -pub mod tests { - use super::*; - use proptest::prelude::*; - - fn do_to_arrow_single(tdb_dt: Datatype) { - let cell_val_num = CellValNum::single(); - let arrow_dt = to_arrow(&tdb_dt, cell_val_num); - match arrow_dt { - DatatypeToArrowResult::Inexact(arrow) => { - assert!(arrow.is_primitive()); - let arrow_size = arrow.primitive_width().unwrap() as u64; - assert_eq!( - tdb_dt.size(), - arrow_size, - "to_arrow({}, {:?}) = {}", - tdb_dt, - cell_val_num, - arrow - ); - - let tdb_out = from_arrow(&arrow); - let (tdb_out, cell_val_num_out) = tdb_out.ok().unwrap(); - - /* the datatype should not match exactly but it must be the same size */ - assert_ne!(tdb_dt, 
tdb_out); - assert_eq!(tdb_dt.size(), tdb_out.size()); - assert_eq!(cell_val_num, cell_val_num_out); - } - DatatypeToArrowResult::Exact(arrow) => { - assert!(arrow.is_primitive()); - let arrow_size = arrow.primitive_width().unwrap() as u64; - assert_eq!( - tdb_dt.size(), - arrow_size, - "to_arrow({}, {:?}) = {}", - tdb_dt, - cell_val_num, - arrow - ); - - let tdb_out = from_arrow(&arrow); - if let DatatypeFromArrowResult::Exact( - tdb_out, - cell_val_num_out, - ) = tdb_out - { - /* the datatype must match exactly */ - assert_eq!(tdb_dt, tdb_out); - assert_eq!(cell_val_num, cell_val_num_out); - } else { - unreachable!( - "Exact conversion did not invert, found {:?}", - tdb_out - ) - } - } - } - } - - fn do_to_arrow_nonvar(tdb_dt: Datatype) { - let fixed_len_in = 32u32; - let cell_val_num = CellValNum::try_from(fixed_len_in).unwrap(); - let arrow_dt = to_arrow(&tdb_dt, cell_val_num); - - use arrow::datatypes::DataType as ADT; - match arrow_dt { - DatatypeToArrowResult::Inexact(arrow) => { - match arrow { - ADT::FixedSizeList(ref item, fixed_len_out) => { - let item_expect = - to_arrow(&tdb_dt, CellValNum::single()); - if let DatatypeToArrowResult::Inexact(item_expect) = - item_expect - { - assert_eq!(item_expect, *item.data_type()); - assert_eq!(fixed_len_in, fixed_len_out as u32); - } else { - unreachable!( - "Expected inexact item match, found {:?}", - item_expect - ) - } - } - arrow => unreachable!( - "Expected FixedSizeList for inexact match but found {}", - arrow - ), - } - - /* invertibility */ - let tdb_out = from_arrow(&arrow); - let (tdb_out, cell_val_num_out) = tdb_out.ok().unwrap(); - - /* inexact match will not be eq, but must be the same size */ - assert_eq!(tdb_dt.size(), tdb_out.size()); - assert_eq!(cell_val_num, cell_val_num_out); - } - DatatypeToArrowResult::Exact(arrow) => { - match arrow { - ADT::FixedSizeList(ref item, fixed_len_out) => { - if let Some(sub_exact) = item - .metadata() - .get(ARROW_FIELD_METADATA_KEY_TILEDB_TYPE_HINT) - { - let 
sub_exact = - Datatype::from_string(sub_exact).unwrap(); - assert_eq!(sub_exact.size(), tdb_dt.size()); - - // item must have been inexact, else we would not have the metadata - let item_dt = - to_arrow(&tdb_dt, CellValNum::single()); - if let DatatypeToArrowResult::Inexact(item_dt) = - item_dt - { - assert_eq!(*item.data_type(), item_dt); - } else { - unreachable!( - "Expected inexact item match but found {:?}", - item_dt - ) - } - } else { - // item must be exact match - let item_dt = - to_arrow(&tdb_dt, CellValNum::single()); - if let DatatypeToArrowResult::Exact(item_dt) = - item_dt - { - assert_eq!(*item.data_type(), item_dt); - } else { - unreachable!( - "Expected exact item match but found {:?}", - item_dt - ) - } - } - assert_eq!(fixed_len_in, fixed_len_out as u32); - } - ADT::FixedSizeBinary(fixed_len_out) => { - assert_eq!(tdb_dt, Datatype::Blob); - assert_eq!(fixed_len_in, fixed_len_out as u32); - } - adt => unreachable!( - "to_arrow({}, {:?}) = {}", - tdb_dt, cell_val_num, adt - ), - } - - /* invertibility */ - let tdb_out = from_arrow(&arrow); - if let DatatypeFromArrowResult::Exact( - tdb_out, - cell_val_num_out, - ) = tdb_out - { - assert_eq!(tdb_dt, tdb_out); - assert_eq!(cell_val_num, cell_val_num_out); - } else { - unreachable!( - "Arrow datatype did not invert, found {:?}", - tdb_out - ) - } - } - } - } - - fn do_to_arrow_var(tdb_dt: Datatype) { - let cell_val_num = CellValNum::Var; - let arrow_dt = to_arrow(&tdb_dt, cell_val_num); - - use arrow::datatypes::DataType as ADT; - match arrow_dt { - DatatypeToArrowResult::Inexact(arrow) => { - assert!( - !arrow.is_primitive(), - "to_arrow({}, {:?}) = {}", - tdb_dt, - cell_val_num, - arrow - ); - - if let ADT::LargeList(ref item) = arrow { - let item_expect = to_arrow(&tdb_dt, CellValNum::single()); - if let DatatypeToArrowResult::Inexact(item_expect) = - item_expect - { - assert_eq!(*item.data_type(), item_expect); - } else { - unreachable!( - "Expected inexact item match, but found {:?}", - 
item_expect - ) - } - } else { - /* other possibilities should be Exact */ - unreachable!( - "Expected LargeList for inexact match but found {:?}", - arrow - ) - } - - let tdb_out = from_arrow(&arrow); - let (tdb_out, cell_val_num_out) = tdb_out.ok().unwrap(); - - /* must be the same size */ - assert_eq!(tdb_dt.size(), tdb_out.size()); - assert_eq!(cell_val_num, cell_val_num_out); - } - DatatypeToArrowResult::Exact(arrow) => { - assert!( - !arrow.is_primitive(), - "to_arrow({}, {:?}) = {}", - tdb_dt, - cell_val_num, - arrow - ); - - match arrow { - ADT::LargeList(ref item) => { - if let Some(sub_exact) = item - .metadata() - .get(ARROW_FIELD_METADATA_KEY_TILEDB_TYPE_HINT) - { - let sub_exact = - Datatype::from_string(sub_exact).unwrap(); - assert_eq!(sub_exact.size(), tdb_dt.size()); - - // item must not have been exact, else we would not have the metadata - let item_dt = - to_arrow(&tdb_dt, CellValNum::single()); - if let DatatypeToArrowResult::Inexact(item_dt) = - item_dt - { - assert_eq!(*item.data_type(), item_dt); - } else { - unreachable!( - "Expected inexact item match but found {:?}", - item_dt - ) - } - } else { - let item_dt = - to_arrow(&tdb_dt, CellValNum::single()); - if let DatatypeToArrowResult::Exact(item_dt) = - item_dt - { - assert_eq!(*item.data_type(), item_dt); - } else { - unreachable!( - "Expected exact item match but found {:?}", - item_dt - ) - } - } - } - ADT::LargeUtf8 => assert!(matches!( - tdb_dt, - Datatype::StringAscii | Datatype::StringUtf8 - )), - ADT::LargeBinary => { - assert!(matches!(tdb_dt, Datatype::Blob)) - } - adt => unreachable!( - "to_arrow({}, {:?}) = {}", - tdb_dt, cell_val_num, adt - ), - } - - let tdb_out = from_arrow(&arrow); - if let DatatypeFromArrowResult::Exact( - tdb_out, - cell_val_num_out, - ) = tdb_out - { - assert_eq!(tdb_dt, tdb_out); - assert_eq!(cell_val_num, cell_val_num_out); - } else { - unreachable!( - "Arrow datatype constructed from tiledb datatype must convert back") - } - } - } - } - - pub fn 
arrow_datatype_is_inexact_compatible( - arrow_in: &arrow::datatypes::DataType, - arrow_out: &arrow::datatypes::DataType, - ) -> bool { - if arrow_in == arrow_out { - return true; - } - - /* otherwise check some inexact compatibilities */ - use arrow::datatypes::DataType as ADT; - match (arrow_in, arrow_out) { - ( - ADT::FixedSizeList(ref item_in, len_in), - ADT::FixedSizeList(ref item_out, len_out), - ) => { - len_in == len_out - && arrow_datatype_is_inexact_compatible( - item_in.data_type(), - item_out.data_type(), - ) - } - (ADT::LargeList(ref item_in), ADT::LargeList(ref item_out)) => { - arrow_datatype_is_inexact_compatible( - item_in.data_type(), - item_out.data_type(), - ) - } - (ADT::FixedSizeList(ref item_in, 1), dt_out) => { - /* - * fixed size list of 1 element should have no extra data, - * we probably don't need to keep the FixedSizeList part - * for correctness, punt on it for now and see if we need - * to deal with it later - */ - arrow_datatype_is_inexact_compatible( - item_in.data_type(), - dt_out, - ) - } - (ADT::LargeUtf8, ADT::LargeList(ref item)) => { - /* - * Arrow does checked UTF-8, tiledb does not, - * so we must permit this inexactness - */ - *item.data_type() == arrow::datatypes::DataType::UInt8 - && !item.is_nullable() - } - (dt_in, dt_out) => { - if dt_in.is_primitive() { - dt_in.primitive_width() == dt_out.primitive_width() - } else { - false - } - } - } - } - - fn do_from_arrow(arrow_in: &arrow::datatypes::DataType) { - match from_arrow(arrow_in) { - DatatypeFromArrowResult::None => (), - DatatypeFromArrowResult::Exact(datatype, cvn) => { - let arrow_out = to_arrow(&datatype, cvn); - if let DatatypeToArrowResult::Exact(arrow_out) = arrow_out { - if let arrow::datatypes::DataType::FixedSizeList( - element, - 1, - ) = arrow_in - { - // FixedSizeList with length 1 has no way to indicate "list" - // for tiledb, so when converting back we lose the FixedSizeList - assert_eq!(*element.data_type(), arrow_out); - } else { - 
assert_eq!(*arrow_in, arrow_out); - } - } else { - unreachable!( - "Expected exact inversion, found {:?}", - arrow_out - ) - } - } - DatatypeFromArrowResult::Inexact(datatype, cvn) => { - let arrow_out = to_arrow(&datatype, cvn); - let arrow_out = arrow_out.into_inner(); - assert!( - arrow_datatype_is_inexact_compatible(arrow_in, &arrow_out), - "{:?} => {:?}", - arrow_in, - arrow_out - ); - } - } - } - - proptest! { - #[test] - fn test_to_arrow_single(tdb_dt in any::()) { - do_to_arrow_single(tdb_dt) - } - - #[test] - fn test_to_arrow_nonvar(tdb_dt in any::()) { - do_to_arrow_nonvar(tdb_dt); - } - - #[test] - fn test_to_arrow_var(tdb_dt in any::()) { - do_to_arrow_var(tdb_dt); - } - - #[test] - fn test_from_arrow(arrow in crate::array::attribute::arrow::strategy::prop_arrow_field()) { - do_from_arrow(arrow.data_type()); - } - } -} diff --git a/tiledb/api/src/datatype/mod.rs b/tiledb/api/src/datatype/mod.rs index ca7f1be2..fd95d5e1 100644 --- a/tiledb/api/src/datatype/mod.rs +++ b/tiledb/api/src/datatype/mod.rs @@ -1,924 +1,4 @@ -pub mod logical; -pub mod physical; - -pub use logical::*; -pub use physical::{PhysicalType, PhysicalValue}; - -use std::fmt::{Debug, Display, Formatter, Result as FmtResult}; - -use serde::{Deserialize, Serialize}; -use util::option::OptionSubset; - -use crate::error::DatatypeErrorKind; -use crate::Result as TileDBResult; - -#[derive(Clone, Copy, Default, Deserialize, Eq, Hash, PartialEq, Serialize)] -#[repr(u64)] -pub enum Datatype { - /// A 32-bit signed integer - Int32, - /// A 64-bit signed integer - Int64, - /// A 32-bit floating point value - Float32, - /// A 64-bit floating point value - Float64, - /// An 8-bit character value - Char, - /// An 8-bit signed integer - Int8, - /// An 8-bit unsigned integer - UInt8, - /// A 16-bit signed integer - Int16, - /// A 16-bit unsigned integer - UInt16, - /// A 32-bit unsigned integer - UInt32, - /// A 64-bit unsigned integer - UInt64, - /// An ASCII string - StringAscii, - /// A UTF-8 string - 
StringUtf8, - /// A UTF-16 string - StringUtf16, - /// A UTF-32 string - StringUtf32, - /// A UCS2 string - StringUcs2, - /// A UCS4 string - StringUcs4, - /// An arbitrary type - // Any is default to cause an error if we forget to set it on either a - // DimensionData or AttributeData instance. - #[default] - Any, - /// DateTime with year resolution - DateTimeYear, - /// DateTime with month resolution - DateTimeMonth, - /// DateTime with week resolution - DateTimeWeek, - /// DateTime with day resolution - DateTimeDay, - /// DateTime with hour resolution - DateTimeHour, - /// DateTime with minute resolution - DateTimeMinute, - /// DateTime with second resolution - DateTimeSecond, - /// DateTime with millisecond resolution - DateTimeMillisecond, - /// DateTime with microsecond resolution - DateTimeMicrosecond, - /// DateTime with nanosecond resolution - DateTimeNanosecond, - /// DateTime with picosecond resolution - DateTimePicosecond, - /// DateTime with femtosecond resolution - DateTimeFemtosecond, - /// DateTime with attosecond resolution - DateTimeAttosecond, - /// Time with hour resolution - TimeHour, - /// Time with minute resolution - TimeMinute, - /// Time with second resolution - TimeSecond, - /// Time with millisecond resolution - TimeMillisecond, - /// Time with nanosecond resolution - TimeMicrosecond, - /// Time with nanosecond resolution - TimeNanosecond, - /// Time with picosecond resolution - TimePicosecond, - /// Time with femtosecond resolution - TimeFemtosecond, - /// Time with attosecond resolution - TimeAttosecond, - /// Byte sequence - Blob, - /// Boolean - Boolean, - /// A Geometry in well-known binary (WKB) format - GeometryWkb, - /// A Geometry in well-known text (WKT) format - GeometryWkt, -} - -impl Datatype { - pub(crate) fn capi_enum(&self) -> ffi::tiledb_datatype_t { - match *self { - Datatype::Int8 => ffi::tiledb_datatype_t_TILEDB_INT8, - Datatype::Int16 => ffi::tiledb_datatype_t_TILEDB_INT16, - Datatype::Int32 => 
ffi::tiledb_datatype_t_TILEDB_INT32, - Datatype::Int64 => ffi::tiledb_datatype_t_TILEDB_INT64, - Datatype::Float32 => ffi::tiledb_datatype_t_TILEDB_FLOAT32, - Datatype::Float64 => ffi::tiledb_datatype_t_TILEDB_FLOAT64, - Datatype::Char => ffi::tiledb_datatype_t_TILEDB_CHAR, - Datatype::UInt8 => ffi::tiledb_datatype_t_TILEDB_UINT8, - Datatype::UInt16 => ffi::tiledb_datatype_t_TILEDB_UINT16, - Datatype::UInt32 => ffi::tiledb_datatype_t_TILEDB_UINT32, - Datatype::UInt64 => ffi::tiledb_datatype_t_TILEDB_UINT64, - Datatype::StringAscii => ffi::tiledb_datatype_t_TILEDB_STRING_ASCII, - Datatype::StringUtf8 => ffi::tiledb_datatype_t_TILEDB_STRING_UTF8, - Datatype::StringUtf16 => ffi::tiledb_datatype_t_TILEDB_STRING_UTF16, - Datatype::StringUtf32 => ffi::tiledb_datatype_t_TILEDB_STRING_UTF32, - Datatype::StringUcs2 => ffi::tiledb_datatype_t_TILEDB_STRING_UCS2, - Datatype::StringUcs4 => ffi::tiledb_datatype_t_TILEDB_STRING_UCS4, - Datatype::Any => ffi::tiledb_datatype_t_TILEDB_ANY, - Datatype::DateTimeYear => { - ffi::tiledb_datatype_t_TILEDB_DATETIME_YEAR - } - Datatype::DateTimeMonth => { - ffi::tiledb_datatype_t_TILEDB_DATETIME_MONTH - } - Datatype::DateTimeWeek => { - ffi::tiledb_datatype_t_TILEDB_DATETIME_WEEK - } - Datatype::DateTimeDay => ffi::tiledb_datatype_t_TILEDB_DATETIME_DAY, - Datatype::DateTimeHour => ffi::tiledb_datatype_t_TILEDB_DATETIME_HR, - Datatype::DateTimeMinute => { - ffi::tiledb_datatype_t_TILEDB_DATETIME_MIN - } - Datatype::DateTimeSecond => { - ffi::tiledb_datatype_t_TILEDB_DATETIME_SEC - } - Datatype::DateTimeMillisecond => { - ffi::tiledb_datatype_t_TILEDB_DATETIME_MS - } - Datatype::DateTimeMicrosecond => { - ffi::tiledb_datatype_t_TILEDB_DATETIME_US - } - Datatype::DateTimeNanosecond => { - ffi::tiledb_datatype_t_TILEDB_DATETIME_NS - } - Datatype::DateTimePicosecond => { - ffi::tiledb_datatype_t_TILEDB_DATETIME_PS - } - Datatype::DateTimeFemtosecond => { - ffi::tiledb_datatype_t_TILEDB_DATETIME_FS - } - Datatype::DateTimeAttosecond => { - 
ffi::tiledb_datatype_t_TILEDB_DATETIME_AS - } - Datatype::TimeHour => ffi::tiledb_datatype_t_TILEDB_TIME_HR, - Datatype::TimeMinute => ffi::tiledb_datatype_t_TILEDB_TIME_MIN, - Datatype::TimeSecond => ffi::tiledb_datatype_t_TILEDB_TIME_SEC, - Datatype::TimeMillisecond => ffi::tiledb_datatype_t_TILEDB_TIME_MS, - Datatype::TimeMicrosecond => ffi::tiledb_datatype_t_TILEDB_TIME_US, - Datatype::TimeNanosecond => ffi::tiledb_datatype_t_TILEDB_TIME_NS, - Datatype::TimePicosecond => ffi::tiledb_datatype_t_TILEDB_TIME_PS, - Datatype::TimeFemtosecond => ffi::tiledb_datatype_t_TILEDB_TIME_FS, - Datatype::TimeAttosecond => ffi::tiledb_datatype_t_TILEDB_TIME_AS, - Datatype::Blob => ffi::tiledb_datatype_t_TILEDB_BLOB, - Datatype::Boolean => ffi::tiledb_datatype_t_TILEDB_BOOL, - Datatype::GeometryWkb => ffi::tiledb_datatype_t_TILEDB_GEOM_WKB, - Datatype::GeometryWkt => ffi::tiledb_datatype_t_TILEDB_GEOM_WKT, - } - } - - pub fn size(&self) -> u64 { - let copy = *self; - unsafe { ffi::tiledb_datatype_size(copy as ffi::tiledb_datatype_t) } - } - - pub fn from_string(dtype: &str) -> Option { - let c_dtype = - std::ffi::CString::new(dtype).expect("Error creating CString"); - let mut c_ret: ffi::tiledb_datatype_t = out_ptr!(); - let res = unsafe { - ffi::tiledb_datatype_from_str( - c_dtype.as_c_str().as_ptr(), - &mut c_ret, - ) - }; - - if res == ffi::TILEDB_OK { - match Datatype::try_from(c_ret) { - Ok(dt) => Some(dt), - Err(_) => None, - } - } else { - None - } - } - - pub fn is_compatible_type(&self) -> bool { - use std::any::TypeId; - - let tid = TypeId::of::(); - if tid == TypeId::of::() { - matches!(*self, Datatype::Float32) - } else if tid == TypeId::of::() { - matches!(*self, Datatype::Float64) - } else if tid == TypeId::of::() { - matches!(*self, Datatype::Char | Datatype::Int8) - } else if tid == TypeId::of::() { - matches!( - *self, - Datatype::Any - | Datatype::Blob - | Datatype::Boolean - | Datatype::GeometryWkb - | Datatype::GeometryWkt - | Datatype::StringAscii - | 
Datatype::StringUtf8 - | Datatype::UInt8 - ) - } else if tid == TypeId::of::() { - matches!(*self, Datatype::Int16) - } else if tid == TypeId::of::() { - matches!( - *self, - Datatype::StringUtf16 | Datatype::StringUcs2 | Datatype::UInt16 - ) - } else if tid == TypeId::of::() { - matches!(*self, Datatype::Int32) - } else if tid == TypeId::of::() { - matches!( - *self, - Datatype::StringUtf32 | Datatype::StringUcs4 | Datatype::UInt32 - ) - } else if tid == TypeId::of::() { - matches!( - *self, - Datatype::Int64 - | Datatype::DateTimeYear - | Datatype::DateTimeMonth - | Datatype::DateTimeWeek - | Datatype::DateTimeDay - | Datatype::DateTimeHour - | Datatype::DateTimeMinute - | Datatype::DateTimeSecond - | Datatype::DateTimeMillisecond - | Datatype::DateTimeMicrosecond - | Datatype::DateTimeNanosecond - | Datatype::DateTimePicosecond - | Datatype::DateTimeFemtosecond - | Datatype::DateTimeAttosecond - | Datatype::TimeHour - | Datatype::TimeMinute - | Datatype::TimeSecond - | Datatype::TimeMillisecond - | Datatype::TimeMicrosecond - | Datatype::TimeNanosecond - | Datatype::TimePicosecond - | Datatype::TimeFemtosecond - | Datatype::TimeAttosecond - ) - } else if tid == TypeId::of::() { - matches!(*self, Datatype::UInt64) - } else { - false - } - } - - /// Returns whether this type is an integral type (i.e. integer) - // Keep in sync with sm/enums/datatype.h::datatype_is_integer - pub fn is_integral_type(&self) -> bool { - matches!( - *self, - Datatype::Boolean - | Datatype::Int8 - | Datatype::Int16 - | Datatype::Int32 - | Datatype::Int64 - | Datatype::UInt8 - | Datatype::UInt16 - | Datatype::UInt32 - | Datatype::UInt64 - ) - } - - /// Returns whether this type is a real number (i.e. 
floating point) - // Keep in sync with sm/enums/datatype.h::datatype_is_real - pub fn is_real_type(&self) -> bool { - matches!(*self, Datatype::Float32 | Datatype::Float64) - } - - /// Returns whether this type is a variable-length string type - // Keep in sync with sm/enums/datatype.h::datatype_is_string - pub fn is_string_type(&self) -> bool { - matches!( - *self, - Datatype::StringAscii - | Datatype::StringUtf8 - | Datatype::StringUtf16 - | Datatype::StringUtf32 - | Datatype::StringUcs2 - | Datatype::StringUcs4 - ) - } - - /// Returns whether this type is a DateTime type of any resolution - // Keep in sync with sm/enums/datatype.h::datatype_is_datetime - pub fn is_datetime_type(&self) -> bool { - matches!( - *self, - Datatype::DateTimeYear - | Datatype::DateTimeMonth - | Datatype::DateTimeWeek - | Datatype::DateTimeDay - | Datatype::DateTimeHour - | Datatype::DateTimeMinute - | Datatype::DateTimeSecond - | Datatype::DateTimeMillisecond - | Datatype::DateTimeMicrosecond - | Datatype::DateTimeNanosecond - | Datatype::DateTimePicosecond - | Datatype::DateTimeFemtosecond - | Datatype::DateTimeAttosecond - ) - } - - /// Returns whether this type is a Time type of any resolution - // Keep in sync with sm/enums/datatype.h::datatype_is_time - pub fn is_time_type(&self) -> bool { - matches!( - *self, - Datatype::TimeHour - | Datatype::TimeMinute - | Datatype::TimeSecond - | Datatype::TimeMillisecond - | Datatype::TimeMicrosecond - | Datatype::TimeNanosecond - | Datatype::TimePicosecond - | Datatype::TimeFemtosecond - | Datatype::TimeAttosecond - ) - } - - /// Returns whether this type is a byte - // Keep in sync with sm/enums/datatype.h:datatype_is_byte - pub fn is_byte_type(&self) -> bool { - matches!( - *self, - Datatype::Blob | Datatype::GeometryWkb | Datatype::GeometryWkt - ) - } - - /// Returns whether this type can be used as a dimension type of a sparse array - pub fn is_allowed_dimension_type_sparse(&self) -> bool { - !matches!(self, Datatype::Boolean) - && 
(self.is_integral_type() - || self.is_datetime_type() - || self.is_time_type() - || matches!( - *self, - Datatype::Float32 - | Datatype::Float64 - | Datatype::StringAscii - )) - } - - /// Returns whether this type can be used as a dimension type of a dense array - pub fn is_allowed_dimension_type_dense(&self) -> bool { - !matches!(self, Datatype::Boolean) - && (self.is_integral_type() - || self.is_datetime_type() - || self.is_time_type()) - } - - pub fn same_physical_type(&self, other: &Datatype) -> bool { - crate::physical_type_go!(self, MyPhysicalType, { - crate::physical_type_go!(other, TheirPhysicalType, { - std::any::TypeId::of::() - == std::any::TypeId::of::() - }) - }) - } - - /// Returns an `Iterator` which yields each variant of `Datatype` - /// exactly once in an unspecified order. - pub fn iter() -> impl Iterator { - static DATATYPES: [Datatype; 43] = [ - Datatype::Int32, - Datatype::Int64, - Datatype::Float32, - Datatype::Float64, - Datatype::Char, - Datatype::Int8, - Datatype::UInt8, - Datatype::Int16, - Datatype::UInt16, - Datatype::UInt32, - Datatype::UInt64, - Datatype::StringAscii, - Datatype::StringUtf8, - Datatype::StringUtf16, - Datatype::StringUtf32, - Datatype::StringUcs2, - Datatype::StringUcs4, - Datatype::DateTimeYear, - Datatype::DateTimeMonth, - Datatype::DateTimeWeek, - Datatype::DateTimeDay, - Datatype::DateTimeHour, - Datatype::DateTimeMinute, - Datatype::DateTimeSecond, - Datatype::DateTimeMillisecond, - Datatype::DateTimeMicrosecond, - Datatype::DateTimeNanosecond, - Datatype::DateTimePicosecond, - Datatype::DateTimeFemtosecond, - Datatype::DateTimeAttosecond, - Datatype::TimeHour, - Datatype::TimeMinute, - Datatype::TimeSecond, - Datatype::TimeMillisecond, - Datatype::TimeMicrosecond, - Datatype::TimeNanosecond, - Datatype::TimePicosecond, - Datatype::TimeFemtosecond, - Datatype::TimeAttosecond, - Datatype::Blob, - Datatype::Boolean, - Datatype::GeometryWkb, - Datatype::GeometryWkt, - ]; - DATATYPES.iter().copied() - } -} - -impl 
Debug for Datatype { - fn fmt(&self, f: &mut Formatter) -> FmtResult { - ::fmt(self, f) - } -} - -impl Display for Datatype { - fn fmt(&self, f: &mut Formatter) -> FmtResult { - let copy = *self; - let c_dtype = copy as ffi::tiledb_datatype_t; - let mut c_str = std::ptr::null::(); - let res = unsafe { ffi::tiledb_datatype_to_str(c_dtype, &mut c_str) }; - - /* - * this cannot error if you provide a valid value, and the strong Rust - * enum ensures that we have a valid value - */ - assert_eq!(res, ffi::TILEDB_OK); - - let c_msg = unsafe { std::ffi::CStr::from_ptr(c_str) }; - write!(f, "{}", c_msg.to_string_lossy()) - } -} - -impl OptionSubset for Datatype { - fn option_subset(&self, other: &Self) -> bool { - if let Datatype::Any = *self { - true - } else { - self == other - } - } -} - -impl TryFrom for Datatype { - type Error = crate::error::Error; - - fn try_from(value: ffi::tiledb_datatype_t) -> TileDBResult { - Ok(match value { - ffi::tiledb_datatype_t_TILEDB_INT8 => Datatype::Int8, - ffi::tiledb_datatype_t_TILEDB_INT16 => Datatype::Int16, - ffi::tiledb_datatype_t_TILEDB_INT32 => Datatype::Int32, - ffi::tiledb_datatype_t_TILEDB_INT64 => Datatype::Int64, - ffi::tiledb_datatype_t_TILEDB_FLOAT32 => Datatype::Float32, - ffi::tiledb_datatype_t_TILEDB_FLOAT64 => Datatype::Float64, - ffi::tiledb_datatype_t_TILEDB_CHAR => Datatype::Char, - ffi::tiledb_datatype_t_TILEDB_UINT8 => Datatype::UInt8, - ffi::tiledb_datatype_t_TILEDB_UINT16 => Datatype::UInt16, - ffi::tiledb_datatype_t_TILEDB_UINT32 => Datatype::UInt32, - ffi::tiledb_datatype_t_TILEDB_UINT64 => Datatype::UInt64, - ffi::tiledb_datatype_t_TILEDB_STRING_ASCII => Datatype::StringAscii, - ffi::tiledb_datatype_t_TILEDB_STRING_UTF8 => Datatype::StringUtf8, - ffi::tiledb_datatype_t_TILEDB_STRING_UTF16 => Datatype::StringUtf16, - ffi::tiledb_datatype_t_TILEDB_STRING_UTF32 => Datatype::StringUtf32, - ffi::tiledb_datatype_t_TILEDB_STRING_UCS2 => Datatype::StringUcs2, - ffi::tiledb_datatype_t_TILEDB_STRING_UCS4 => 
Datatype::StringUcs4, - ffi::tiledb_datatype_t_TILEDB_ANY => Datatype::Any, - ffi::tiledb_datatype_t_TILEDB_DATETIME_YEAR => { - Datatype::DateTimeYear - } - ffi::tiledb_datatype_t_TILEDB_DATETIME_MONTH => { - Datatype::DateTimeMonth - } - ffi::tiledb_datatype_t_TILEDB_DATETIME_WEEK => { - Datatype::DateTimeWeek - } - ffi::tiledb_datatype_t_TILEDB_DATETIME_DAY => Datatype::DateTimeDay, - ffi::tiledb_datatype_t_TILEDB_DATETIME_HR => Datatype::DateTimeHour, - ffi::tiledb_datatype_t_TILEDB_DATETIME_MIN => { - Datatype::DateTimeMinute - } - ffi::tiledb_datatype_t_TILEDB_DATETIME_SEC => { - Datatype::DateTimeSecond - } - ffi::tiledb_datatype_t_TILEDB_DATETIME_MS => { - Datatype::DateTimeMillisecond - } - ffi::tiledb_datatype_t_TILEDB_DATETIME_US => { - Datatype::DateTimeMicrosecond - } - ffi::tiledb_datatype_t_TILEDB_DATETIME_NS => { - Datatype::DateTimeNanosecond - } - ffi::tiledb_datatype_t_TILEDB_DATETIME_PS => { - Datatype::DateTimePicosecond - } - ffi::tiledb_datatype_t_TILEDB_DATETIME_FS => { - Datatype::DateTimeFemtosecond - } - ffi::tiledb_datatype_t_TILEDB_DATETIME_AS => { - Datatype::DateTimeAttosecond - } - ffi::tiledb_datatype_t_TILEDB_TIME_HR => Datatype::TimeHour, - ffi::tiledb_datatype_t_TILEDB_TIME_MIN => Datatype::TimeMinute, - ffi::tiledb_datatype_t_TILEDB_TIME_SEC => Datatype::TimeSecond, - ffi::tiledb_datatype_t_TILEDB_TIME_MS => Datatype::TimeMillisecond, - ffi::tiledb_datatype_t_TILEDB_TIME_US => Datatype::TimeMicrosecond, - ffi::tiledb_datatype_t_TILEDB_TIME_NS => Datatype::TimeNanosecond, - ffi::tiledb_datatype_t_TILEDB_TIME_PS => Datatype::TimePicosecond, - ffi::tiledb_datatype_t_TILEDB_TIME_FS => Datatype::TimeFemtosecond, - ffi::tiledb_datatype_t_TILEDB_TIME_AS => Datatype::TimeAttosecond, - ffi::tiledb_datatype_t_TILEDB_BLOB => Datatype::Blob, - ffi::tiledb_datatype_t_TILEDB_BOOL => Datatype::Boolean, - ffi::tiledb_datatype_t_TILEDB_GEOM_WKB => Datatype::GeometryWkb, - ffi::tiledb_datatype_t_TILEDB_GEOM_WKT => Datatype::GeometryWkt, - _ => { 
- return Err(crate::error::Error::Datatype( - DatatypeErrorKind::InvalidDiscriminant(value as u64), - )) - } - }) - } -} - -/// Apply a generic expression `$then` with a static type binding in the identifier `$typename` -/// for a logical type corresponding to the dynamic `$datatype`. -/// -/// This is similar to `physical_type_go!` but binds the logical type -/// instead of the physical type. -// note to developers: this is mimicking the C++ code -// template -// inline auto apply_with_type(Fn&& f, Datatype type, Args&&... args) -// -#[macro_export] -macro_rules! logical_type_go { - ($datatype:expr, $typename:ident, $then:expr) => {{ - type Datatype = $crate::Datatype; - match $datatype { - Datatype::Int8 => { - type $typename = $crate::datatype::logical::Int8Type; - $then - } - Datatype::Int16 => { - type $typename = $crate::datatype::logical::Int16Type; - $then - } - Datatype::Int32 => { - type $typename = $crate::datatype::logical::Int32Type; - $then - } - Datatype::Int64 => { - type $typename = $crate::datatype::logical::Int64Type; - $then - } - Datatype::UInt8 => { - type $typename = $crate::datatype::logical::UInt8Type; - $then - } - Datatype::UInt16 => { - type $typename = $crate::datatype::logical::UInt16Type; - $then - } - Datatype::UInt32 => { - type $typename = $crate::datatype::logical::UInt32Type; - $then - } - Datatype::UInt64 => { - type $typename = $crate::datatype::logical::UInt64Type; - $then - } - Datatype::Float32 => { - type $typename = $crate::datatype::logical::Float32Type; - $then - } - Datatype::Float64 => { - type $typename = $crate::datatype::logical::Float64Type; - $then - } - Datatype::Char => { - type $typename = $crate::datatype::logical::CharType; - $then - } - Datatype::StringAscii => { - type $typename = $crate::datatype::logical::StringAsciiType; - $then - } - Datatype::StringUtf8 => { - type $typename = $crate::datatype::logical::StringUtf8Type; - $then - } - Datatype::StringUtf16 => { - type $typename = 
$crate::datatype::logical::StringUtf16Type; - $then - } - Datatype::StringUtf32 => { - type $typename = $crate::datatype::logical::StringUtf32Type; - $then - } - Datatype::StringUcs2 => { - type $typename = $crate::datatype::logical::StringUcs2Type; - $then - } - Datatype::StringUcs4 => { - type $typename = $crate::datatype::logical::StringUcs4Type; - $then - } - Datatype::Any => { - type $typename = $crate::datatype::logical::AnyType; - $then - } - Datatype::DateTimeYear => { - type $typename = $crate::datatype::logical::DateTimeYearType; - $then - } - Datatype::DateTimeMonth => { - type $typename = $crate::datatype::logical::DateTimeMonthType; - $then - } - Datatype::DateTimeWeek => { - type $typename = $crate::datatype::logical::DateTimeWeekType; - $then - } - Datatype::DateTimeDay => { - type $typename = $crate::datatype::logical::DateTimeDayType; - $then - } - Datatype::DateTimeHour => { - type $typename = $crate::datatype::logical::DateTimeHourType; - $then - } - Datatype::DateTimeMinute => { - type $typename = $crate::datatype::logical::DateTimeMinuteType; - $then - } - Datatype::DateTimeSecond => { - type $typename = $crate::datatype::logical::DateTimeSecondType; - $then - } - Datatype::DateTimeMillisecond => { - type $typename = - $crate::datatype::logical::DateTimeMillisecondType; - $then - } - Datatype::DateTimeMicrosecond => { - type $typename = - $crate::datatype::logical::DateTimeMicrosecondType; - $then - } - Datatype::DateTimeNanosecond => { - type $typename = - $crate::datatype::logical::DateTimeNanosecondType; - $then - } - Datatype::DateTimePicosecond => { - type $typename = - $crate::datatype::logical::DateTimePicosecondType; - $then - } - Datatype::DateTimeFemtosecond => { - type $typename = - $crate::datatype::logical::DateTimeFemtosecondType; - $then - } - Datatype::DateTimeAttosecond => { - type $typename = - $crate::datatype::logical::DateTimeAttosecondType; - $then - } - Datatype::TimeHour => { - type $typename = 
$crate::datatype::logical::TimeHourType; - $then - } - Datatype::TimeMinute => { - type $typename = $crate::datatype::logical::TimeMinuteType; - $then - } - Datatype::TimeSecond => { - type $typename = $crate::datatype::logical::TimeSecondType; - $then - } - Datatype::TimeMillisecond => { - type $typename = $crate::datatype::logical::TimeMillisecondType; - $then - } - Datatype::TimeMicrosecond => { - type $typename = $crate::datatype::logical::TimeMicrosecondType; - $then - } - Datatype::TimeNanosecond => { - type $typename = $crate::datatype::logical::TimeNanosecondType; - $then - } - Datatype::TimePicosecond => { - type $typename = $crate::datatype::logical::TimePicosecondType; - $then - } - Datatype::TimeFemtosecond => { - type $typename = $crate::datatype::logical::TimeFemtosecondType; - $then - } - Datatype::TimeAttosecond => { - type $typename = $crate::datatype::logical::TimeAttosecondType; - $then - } - Datatype::Blob => { - type $typename = $crate::datatype::logical::BlobType; - $then - } - Datatype::Boolean => { - type $typename = $crate::datatype::logical::BooleanType; - $then - } - Datatype::GeometryWkb => { - type $typename = $crate::datatype::logical::GeometryWkbType; - $then - } - Datatype::GeometryWkt => { - type $typename = $crate::datatype::logical::GeometryWktType; - $then - } - } - }}; -} - -/// Apply a generic expression `$then` with a static type binding in the identifier `$typename` -/// for a physical type corresponding to the dynamic `$datatype`. -/// -/// This is similar to `logical_type_go!` but binds the physical type instead of logical -/// type which is useful for calling generic functions and methods with a `PhysicalType` -/// trait bound. -/// -/// # Examples -/// -/// ``` -/// use tiledb::{physical_type_go, Context, Datatype}; -/// use tiledb::array::dimension::{Dimension, DimensionConstraints, Builder}; -/// -/// fn dimension_num_cells(d: &Dimension) -> Option { -/// physical_type_go!(d.datatype().unwrap(), DT, { -/// d.domain::
().unwrap().map(|[low, high]| (high - low) as u64 + 1) -/// }) -/// } -/// let ctx = Context::new().unwrap(); -/// -/// let d1 = Builder::new(&ctx, "d1", Datatype::UInt32, -/// DimensionConstraints::UInt32([0, 16], Some(4))).unwrap().build(); -/// assert_eq!(Some(17), dimension_num_cells(&d1)); -/// -/// let d2 = Builder::new(&ctx, "d2", Datatype::Int8, -/// DimensionConstraints::Int8([-4, 4], Some(2))).unwrap().build(); -/// assert_eq!(Some(9), dimension_num_cells(&d2)); -/// ``` -#[macro_export] -macro_rules! physical_type_go { - ($datatype:expr, $typename:ident, $then:expr) => {{ - $crate::logical_type_go!($datatype, PhysicalTypeGoLogicalType, { - type $typename = ::PhysicalType; - $then - }) - }}; -} - #[cfg(feature = "arrow")] pub mod arrow; -#[cfg(any(test, feature = "proptest-strategies"))] -pub mod strategy; - -#[cfg(test)] -mod tests { - use std::collections::HashSet; - - use proptest::prelude::*; - use util::{assert_not_option_subset, assert_option_subset}; - - use super::*; - - #[test] - fn datatype_roundtrips() { - for i in 0..256 { - let maybe_dt = Datatype::try_from(i); - if maybe_dt.is_ok() { - let dt = maybe_dt.unwrap(); - let dt_str = dt.to_string(); - let str_dt = Datatype::from_string(&dt_str) - .expect("Error round tripping datatype string."); - assert_eq!(str_dt, dt); - } - } - } - - #[test] - fn datatype_test() { - for i in 0..256 { - println!("I: {}", i); - if i <= 43 { - let dt = Datatype::try_from(i as u32) - .expect("Error converting value to Datatype"); - assert_ne!( - format!("{}", dt), - "".to_string() - ); - assert!(check_valid(&dt)); - } else { - assert!(Datatype::try_from(i as u32).is_err()); - } - } - } - - #[test] - fn iter() { - let mut yielded = HashSet::::new(); - for dt in Datatype::iter() { - let prev = yielded.insert(dt); - assert!(prev); - } - } - - fn check_valid(dt: &Datatype) -> bool { - let mut count = 0; - - if dt.is_compatible_type::() { - count += 1; - } - - if dt.is_compatible_type::() { - count += 1; - } - - if 
dt.is_compatible_type::() { - count += 1; - } - - if dt.is_compatible_type::() { - count += 1; - } - - if dt.is_compatible_type::() { - count += 1; - } - - if dt.is_compatible_type::() { - count += 1; - } - - if dt.is_compatible_type::() { - count += 1; - } - - if dt.is_compatible_type::() { - count += 1; - } - - if dt.is_compatible_type::() { - count += 1; - } - - if dt.is_compatible_type::() { - count += 1; - } - - count == 1 - } - - #[test] - fn option_subset() { - assert_option_subset!(Datatype::Any, Datatype::Any); - assert_option_subset!(Datatype::Any, Datatype::UInt16); - assert_option_subset!(Datatype::Any, Datatype::UInt32); - assert_option_subset!(Datatype::UInt16, Datatype::UInt16); - assert_option_subset!(Datatype::UInt32, Datatype::UInt32); - assert_not_option_subset!(Datatype::UInt32, Datatype::Any); - assert_not_option_subset!(Datatype::UInt32, Datatype::UInt16); - assert_not_option_subset!(Datatype::UInt16, Datatype::Any); - assert_not_option_subset!(Datatype::UInt16, Datatype::UInt32); - } - - proptest! 
{ - #[test] - fn logical_type(dt in any::()) { - logical_type_go!(dt, LT, { - let lt_constant = ::DATA_TYPE; - assert_eq!(dt, lt_constant); - - assert!(dt.is_compatible_type::<::PhysicalType>()); - }) - } - } -} +pub use tiledb_common::datatype::*; diff --git a/tiledb/api/src/datatype/strategy.rs b/tiledb/api/src/datatype/strategy.rs deleted file mode 100644 index 4807a652..00000000 --- a/tiledb/api/src/datatype/strategy.rs +++ /dev/null @@ -1,338 +0,0 @@ -use proptest::prelude::*; - -use crate::Datatype; - -fn prop_datatype() -> impl Strategy { - prop_oneof![ - Just(Datatype::Int8), - Just(Datatype::Int16), - Just(Datatype::Int32), - Just(Datatype::Int64), - Just(Datatype::UInt8), - Just(Datatype::UInt16), - Just(Datatype::UInt32), - Just(Datatype::UInt64), - Just(Datatype::Float32), - Just(Datatype::Float64), - Just(Datatype::Char), - Just(Datatype::StringAscii), - Just(Datatype::StringUtf8), - Just(Datatype::StringUtf16), - Just(Datatype::StringUtf32), - Just(Datatype::StringUcs2), - Just(Datatype::StringUcs4), - Just(Datatype::Any), - Just(Datatype::DateTimeYear), - Just(Datatype::DateTimeMonth), - Just(Datatype::DateTimeWeek), - Just(Datatype::DateTimeDay), - Just(Datatype::DateTimeHour), - Just(Datatype::DateTimeMinute), - Just(Datatype::DateTimeSecond), - Just(Datatype::DateTimeMillisecond), - Just(Datatype::DateTimeMicrosecond), - Just(Datatype::DateTimeNanosecond), - Just(Datatype::DateTimePicosecond), - Just(Datatype::DateTimeFemtosecond), - Just(Datatype::DateTimeAttosecond), - Just(Datatype::TimeHour), - Just(Datatype::TimeMinute), - Just(Datatype::TimeSecond), - Just(Datatype::TimeMillisecond), - Just(Datatype::TimeMicrosecond), - Just(Datatype::TimeNanosecond), - Just(Datatype::TimePicosecond), - Just(Datatype::TimeFemtosecond), - Just(Datatype::TimeAttosecond), - Just(Datatype::Blob), - Just(Datatype::Boolean), - Just(Datatype::GeometryWkb), - Just(Datatype::GeometryWkt), - ] -} - -const DENSE_DIMENSION_DATATYPES: [Datatype; 30] = [ - Datatype::Int8, 
- Datatype::Int16, - Datatype::Int32, - Datatype::Int64, - Datatype::UInt8, - Datatype::UInt16, - Datatype::UInt32, - Datatype::UInt64, - Datatype::DateTimeYear, - Datatype::DateTimeMonth, - Datatype::DateTimeWeek, - Datatype::DateTimeDay, - Datatype::DateTimeHour, - Datatype::DateTimeMinute, - Datatype::DateTimeSecond, - Datatype::DateTimeMillisecond, - Datatype::DateTimeMicrosecond, - Datatype::DateTimeNanosecond, - Datatype::DateTimePicosecond, - Datatype::DateTimeFemtosecond, - Datatype::DateTimeAttosecond, - Datatype::TimeHour, - Datatype::TimeMinute, - Datatype::TimeSecond, - Datatype::TimeMillisecond, - Datatype::TimeMicrosecond, - Datatype::TimeNanosecond, - Datatype::TimePicosecond, - Datatype::TimeFemtosecond, - Datatype::TimeAttosecond, -]; - -const SPARSE_DIMENSION_DATATYPES: [Datatype; 33] = [ - Datatype::Int8, - Datatype::Int16, - Datatype::Int32, - Datatype::Int64, - Datatype::UInt8, - Datatype::UInt16, - Datatype::UInt32, - Datatype::UInt64, - Datatype::Float32, - Datatype::Float64, - Datatype::DateTimeYear, - Datatype::DateTimeMonth, - Datatype::DateTimeWeek, - Datatype::DateTimeDay, - Datatype::DateTimeHour, - Datatype::DateTimeMinute, - Datatype::DateTimeSecond, - Datatype::DateTimeMillisecond, - Datatype::DateTimeMicrosecond, - Datatype::DateTimeNanosecond, - Datatype::DateTimePicosecond, - Datatype::DateTimeFemtosecond, - Datatype::DateTimeAttosecond, - Datatype::TimeHour, - Datatype::TimeMinute, - Datatype::TimeSecond, - Datatype::TimeMillisecond, - Datatype::TimeMicrosecond, - Datatype::TimeNanosecond, - Datatype::TimePicosecond, - Datatype::TimeFemtosecond, - Datatype::TimeAttosecond, - Datatype::StringAscii, -]; - -fn prop_datatype_for_dense_dimension() -> impl Strategy { - /* see `Datatype::is_allowed_dimension_type_dense` */ - proptest::strategy::Union::new( - DENSE_DIMENSION_DATATYPES.iter().map(|dt| Just(*dt)), - ) -} - -fn prop_datatype_for_sparse_dimension() -> impl Strategy { - /* see `Datatype::is_allowed_dimension_type_sparse` */ - 
proptest::strategy::Union::new( - SPARSE_DIMENSION_DATATYPES.iter().map(|dt| Just(*dt)), - ) -} - -const DELTA_FILTER_REINTERPRET_DATATYPES: [Datatype; 37] = [ - Datatype::Any, - Datatype::UInt8, - Datatype::UInt16, - Datatype::UInt32, - Datatype::UInt64, - Datatype::Int8, - Datatype::Int16, - Datatype::Int32, - Datatype::Int64, - Datatype::Float32, - Datatype::Float64, - Datatype::Boolean, - Datatype::Blob, - Datatype::GeometryWkb, - Datatype::GeometryWkt, - Datatype::DateTimeYear, - Datatype::DateTimeMonth, - Datatype::DateTimeWeek, - Datatype::DateTimeDay, - Datatype::DateTimeHour, - Datatype::DateTimeMinute, - Datatype::DateTimeSecond, - Datatype::DateTimeMillisecond, - Datatype::DateTimeMicrosecond, - Datatype::DateTimeNanosecond, - Datatype::DateTimePicosecond, - Datatype::DateTimeFemtosecond, - Datatype::DateTimeAttosecond, - Datatype::TimeHour, - Datatype::TimeMinute, - Datatype::TimeSecond, - Datatype::TimeMillisecond, - Datatype::TimeMicrosecond, - Datatype::TimeNanosecond, - Datatype::TimePicosecond, - Datatype::TimeFemtosecond, - Datatype::TimeAttosecond, -]; - -fn prop_datatype_for_delta_filter() -> impl Strategy { - // see core `FilterBuffer::buffers_as` - proptest::strategy::Union::new( - DELTA_FILTER_REINTERPRET_DATATYPES - .iter() - .map(|dt| Just(*dt)), - ) -} - -#[derive(Clone, Debug, Default)] -pub enum DatatypeContext { - #[default] - Any, - NotAny, - DenseDimension, - SparseDimension, - DeltaFilterReinterpretDatatype, - Fixed(Datatype), -} - -impl Arbitrary for Datatype { - type Parameters = DatatypeContext; - type Strategy = BoxedStrategy; - - fn arbitrary_with(p: Self::Parameters) -> Self::Strategy { - match p { - DatatypeContext::Any => prop_datatype().boxed(), - DatatypeContext::NotAny => prop_datatype() - .prop_filter("Datatype::Any", |dt| *dt != Datatype::Any) - .boxed(), - DatatypeContext::DenseDimension => { - prop_datatype_for_dense_dimension().boxed() - } - DatatypeContext::SparseDimension => { - 
prop_datatype_for_sparse_dimension().boxed() - } - DatatypeContext::DeltaFilterReinterpretDatatype => { - prop_datatype_for_delta_filter().boxed() - } - DatatypeContext::Fixed(dt) => Just(dt).boxed(), - } - } -} - -#[cfg(test)] -mod tests { - use super::*; - use crate::array::dimension::DimensionConstraints; - use crate::array::{ - ArrayType, AttributeBuilder, DimensionBuilder, DomainBuilder, Schema, - SchemaBuilder, - }; - use crate::error::Error; - use crate::{Context, Result as TileDBResult}; - - /// Creates a schema with a single dimension of the given `Datatype` with one attribute. - /// Used by the test to check if the `Datatype` can be used in this way. - fn dimension_comprehensive_schema( - context: &Context, - array_type: ArrayType, - datatype: Datatype, - ) -> TileDBResult { - let dim = physical_type_go!(datatype, DT, { - if matches!(datatype, Datatype::StringAscii) { - DimensionBuilder::new( - context, - "d", - datatype, - DimensionConstraints::StringAscii, - ) - } else { - let domain: [DT; 2] = [0 as DT, 127 as DT]; - let extent: DT = 16 as DT; - DimensionBuilder::new(context, "d", datatype, (domain, extent)) - } - })? - .build(); - - let attr = AttributeBuilder::new(context, "a", Datatype::Any)?.build(); - - let domain = DomainBuilder::new(context)?.add_dimension(dim)?.build(); - SchemaBuilder::new(context, array_type, domain)? - .add_attribute(attr)? 
- .build() - } - - fn do_dense_dimension_comprehensive(datatype: Datatype) { - let allowed = DENSE_DIMENSION_DATATYPES.contains(&datatype); - assert_eq!(allowed, datatype.is_allowed_dimension_type_dense()); - - let context = Context::new().unwrap(); - let r = dimension_comprehensive_schema( - &context, - ArrayType::Dense, - datatype, - ); - assert_eq!(allowed, r.is_ok(), "try_construct => {:?}", r.err()); - if let Err(Error::LibTileDB(s)) = r { - assert!( - s.contains("not a valid Dimension Datatype") - || s.contains("do not support dimension datatype"), - "Expected dimension datatype error, received: {}", - s - ); - } else { - assert!( - r.is_ok(), - "Found error other than LibTileDB: {}", - r.err().unwrap() - ); - } - } - - fn do_sparse_dimension_comprehensive(datatype: Datatype) { - let allowed = SPARSE_DIMENSION_DATATYPES.contains(&datatype); - assert_eq!(allowed, datatype.is_allowed_dimension_type_sparse()); - - let context = Context::new().unwrap(); - let r = dimension_comprehensive_schema( - &context, - ArrayType::Sparse, - datatype, - ); - assert_eq!(allowed, r.is_ok(), "try_construct => {:?}", r.err()); - if let Err(Error::LibTileDB(s)) = r { - assert!( - s.contains("not a valid Dimension Datatype") - || s.contains("do not support dimension datatype"), - "Expected dimension datatype error, received: {}", - s - ); - } else { - assert!( - r.is_ok(), - "Found error other than LibTileDB: {}", - r.err().unwrap() - ); - } - } - - proptest! 
{ - #[test] - fn dense_dimension(dt in any_with::(DatatypeContext::DenseDimension)) { - assert!(dt.is_allowed_dimension_type_dense()) - } - - #[test] - fn dense_dimension_comprehensive(dt in any::()) { - do_dense_dimension_comprehensive(dt) - } - - #[test] - fn sparse_dimension(dt in any_with::(DatatypeContext::SparseDimension)) { - assert!(dt.is_allowed_dimension_type_sparse()) - } - - #[test] - fn sparse_dimension_comprehensive(dt in any::()) { - do_sparse_dimension_comprehensive(dt) - } - } -} diff --git a/tiledb/api/src/error.rs b/tiledb/api/src/error.rs index 7dcd843c..8b07020f 100644 --- a/tiledb/api/src/error.rs +++ b/tiledb/api/src/error.rs @@ -3,10 +3,13 @@ extern crate tiledb_sys as ffi; use std::fmt::{Debug, Display, Formatter, Result as FmtResult}; use std::ops::Deref; -use crate::array::CellValNum; -use crate::Datatype; +#[cfg(feature = "serde")] use serde::{Serialize, Serializer}; +use tiledb_common::array::CellValNum; + +pub use tiledb_common::datatype::Error as DatatypeError; + pub(crate) enum RawError { Owned(*mut ffi::tiledb_error_t), } @@ -26,107 +29,6 @@ impl Drop for RawError { } } -#[derive(Clone, Debug)] -pub enum DatatypeErrorKind { - InvalidDiscriminant(u64), - TypeMismatch { - user_type: String, - tiledb_type: Datatype, - }, - PhysicalTypeMismatch { - requested_type: String, - actual_type: String, - }, - UnexpectedCellStructure { - context: Option, - found: CellValNum, - expected: CellValNum, - }, - UnexpectedValidity { - context: Option, - }, - InvalidDatatype { - context: Option, - found: Datatype, - expected: Datatype, - }, -} - -impl Display for DatatypeErrorKind { - fn fmt(&self, f: &mut Formatter) -> FmtResult { - match self { - DatatypeErrorKind::InvalidDiscriminant(value) => { - write!(f, "Invalid datatype: {}", value) - } - DatatypeErrorKind::TypeMismatch { - user_type, - tiledb_type, - } => { - write!( - f, - "Type mismatch: requested {}, but found {}", - user_type, tiledb_type - ) - } - DatatypeErrorKind::PhysicalTypeMismatch { 
- requested_type, - actual_type, - } => { - write!( - f, - "Physical type mismatch: requested {}, but found {}", - requested_type, actual_type - ) - } - DatatypeErrorKind::UnexpectedCellStructure { - ref context, - found, - expected, - } => { - if let Some(context) = context.as_ref() { - write!( - f, - "Unexpected cell val num for {}: expected {}, found {}", - context, expected, found - ) - } else { - write!( - f, - "Unexpected cell val num: expected {}, found {}", - expected, found - ) - } - } - DatatypeErrorKind::UnexpectedValidity { context } => { - if let Some(context) = context { - write!(f, "Unexpected validity data for {}", context) - } else { - write!(f, "Unexpected validity data") - } - } - DatatypeErrorKind::InvalidDatatype { - ref context, - found, - expected, - } => { - if let Some(context) = context.as_ref() { - write!( - f, - "Unexpected datatype for {}: expected {}, found {}", - context, expected, found - ) - } else { - write!( - f, - "Unexpected datatype: expected {}, found {}", - expected, found - ) - } - } - } - } -} - #[derive(Clone, Debug)] pub enum ObjectTypeErrorKind { InvalidDiscriminant(u64), @@ -142,21 +44,6 @@ impl Display for ObjectTypeErrorKind { } } -#[derive(Clone, Debug)] -pub enum ModeErrorKind { - InvalidDiscriminant(u64), -} - -impl Display for ModeErrorKind { - fn fmt(&self, f: &mut Formatter) -> FmtResult { - match self { - ModeErrorKind::InvalidDiscriminant(value) => { - write!(f, "Invalid mode type: {}", value) - } - } - } -} - #[derive(Debug, thiserror::Error)] pub enum Error { /// Internal error due to bugs in tiledb. 
@@ -177,13 +64,52 @@ pub enum Error { InvalidIndex(usize), /// Error with datatype handling #[error("Datatype error: {0}")] - Datatype(DatatypeErrorKind), + Datatype(#[from] DatatypeError), /// Error with ObjectType handling #[error("Object type error: {0}")] ObjectType(ObjectTypeErrorKind), + #[error("Datatype interface error: {0}")] + DatatypeFFIError(#[from] tiledb_common::datatype::TryFromFFIError), /// Error with Mode handling #[error("Mode type error: {0}")] - ModeType(ModeErrorKind), + ModeType(#[from] tiledb_common::array::ModeError), + #[error("ArrayType error: {0}")] + ArrayTypeError(#[from] tiledb_common::array::ArrayTypeError), + #[error("CellValNum error: {0}")] + CellValNumError(#[from] tiledb_common::array::CellValNumError), + #[error("CellOrder error: {0}")] + CellOrder(#[from] tiledb_common::array::CellOrderError), + #[error("TileOrder error: {0}")] + TileOrder(#[from] tiledb_common::array::TileOrderError), + #[error("FilterType error: {0}")] + FilterType(#[from] crate::filter::FilterTypeError), + #[error("FilterOption error: {0}")] + FilterOption(#[from] crate::filter::FilterOptionError), + #[error("WebPFilter error: {0}")] + WebPFilterType(#[from] crate::filter::WebPFilterError), + #[error("ScaleFloatByteWidth error: {0}")] + ScaleFloatFilter(#[from] crate::filter::ScaleFloatByteWidthError), + #[error("Dimension error: {0}")] + DimensionError(#[from] tiledb_common::array::dimension::Error), + #[error("Dimension range error: {0}")] + DimensionRangeError( + #[from] tiledb_common::range::DimensionCompatibilityError, + ), + #[error("FromFillValue error: {0}")] + FromFillValueError( + #[from] tiledb_common::array::attribute::FromFillValueError, + ), + #[error("Range raw data error: {0}")] + RangeRawDataError(#[from] tiledb_common::range::RangeFromSlicesError), + #[error("Multi-value range error: {0}")] + MultiValueRangeError(#[from] tiledb_common::range::MultiValueRangeError), + #[error("Unexpected {}: expected {expected}, found {found}", 
std::any::type_name::())] + UnexpectedCellStructure { + expected: CellValNum, + found: CellValNum, + }, + #[error("Unexpected null values")] + UnexpectedValidity, /// Error serializing data #[error("Serialization error: {0}: {1}")] Serialization(String, #[source] anyhow::Error), @@ -199,15 +125,6 @@ pub enum Error { Other(String), } -impl Error { - pub(crate) fn physical_type_mismatch() -> Self { - Self::Datatype(DatatypeErrorKind::PhysicalTypeMismatch { - requested_type: std::any::type_name::().to_owned(), - actual_type: std::any::type_name::().to_owned(), - }) - } -} - impl From for Error { fn from(e: RawError) -> Self { let mut c_msg: *const std::os::raw::c_char = out_ptr!(); @@ -227,6 +144,7 @@ impl From for Error { } } +#[cfg(feature = "serde")] impl Serialize for Error { fn serialize(&self, serializer: S) -> Result where diff --git a/tiledb/api/src/filter/arrow.rs b/tiledb/api/src/filter/arrow.rs index 4fff9b55..0a30971b 100644 --- a/tiledb/api/src/filter/arrow.rs +++ b/tiledb/api/src/filter/arrow.rs @@ -39,15 +39,17 @@ impl FilterMetadata { #[cfg(test)] mod tests { + use proptest::prelude::*; + use tiledb_pod::filter::strategy::FilterPipelineStrategy; + use super::*; use crate::{Context, Factory}; - use proptest::prelude::*; #[test] fn test_serialize_invertibility() { let c: TileDBContext = Context::new().unwrap(); - proptest!(|(filters_in in any::())| { + proptest!(|(filters_in in FilterPipelineStrategy::default())| { let filters_in = filters_in.create(&c) .expect("Error constructing arbitrary filter list"); let metadata = FilterMetadata::new(&filters_in) diff --git a/tiledb/api/src/filter/ftype.rs b/tiledb/api/src/filter/ftype.rs index ed816903..2fd81f76 100644 --- a/tiledb/api/src/filter/ftype.rs +++ b/tiledb/api/src/filter/ftype.rs @@ -1,7 +1,15 @@ -use crate::error::Error; -use crate::Result as TileDBResult; +use thiserror::Error; +use tiledb_common::filter::{ + ChecksumType, CompressionData, CompressionType, FilterData, +}; -#[derive(Clone, Debug, 
PartialEq)] +#[derive(Clone, Debug, Error)] +pub enum Error { + #[error("Invalid discriminant for {}: {0}", std::any::type_name::())] + InvalidDiscriminant(u64), +} + +#[derive(Clone, Copy, Debug, PartialEq)] pub enum FilterType { None, Gzip, @@ -23,9 +31,9 @@ pub enum FilterType { Delta, } -impl FilterType { - pub(crate) fn capi_enum(&self) -> ffi::tiledb_filter_type_t { - match *self { +impl From for ffi::tiledb_filter_type_t { + fn from(value: FilterType) -> Self { + match value { FilterType::None => ffi::tiledb_filter_type_t_TILEDB_FILTER_NONE, FilterType::Gzip => ffi::tiledb_filter_type_t_TILEDB_FILTER_GZIP, FilterType::Zstd => ffi::tiledb_filter_type_t_TILEDB_FILTER_ZSTD, @@ -64,46 +72,11 @@ impl FilterType { FilterType::Delta => ffi::tiledb_filter_type_t_TILEDB_FILTER_DELTA, } } - - pub fn to_string(&self) -> TileDBResult { - let mut c_str = std::ptr::null::(); - let res = unsafe { - ffi::tiledb_filter_type_to_str(self.capi_enum(), &mut c_str) - }; - if res == ffi::TILEDB_OK { - let c_msg = unsafe { std::ffi::CStr::from_ptr(c_str) }; - Ok(String::from(c_msg.to_string_lossy())) - } else { - Err(Error::LibTileDB(format!( - "Error converting filter type: {:?} to string", - self - ))) - } - } - - pub fn from_string(fs: &str) -> TileDBResult { - let c_ftype = - std::ffi::CString::new(fs).expect("Error creating CString"); - std::ffi::CString::new(fs).expect("Error creating CString"); - let mut c_ret: u32 = 0; - let res = unsafe { - ffi::tiledb_filter_type_from_str( - c_ftype.as_c_str().as_ptr(), - &mut c_ret, - ) - }; - - if res == ffi::TILEDB_OK { - FilterType::try_from(c_ret) - } else { - Err(Error::LibTileDB(format!("Invalid filter type: {}", fs))) - } - } } impl TryFrom for FilterType { - type Error = crate::error::Error; - fn try_from(value: u32) -> TileDBResult { + type Error = Error; + fn try_from(value: u32) -> std::result::Result { match value { ffi::tiledb_filter_type_t_TILEDB_FILTER_NONE => { Ok(FilterType::None) @@ -153,30 +126,66 @@ impl TryFrom for 
FilterType { ffi::tiledb_filter_type_t_TILEDB_FILTER_DELTA => { Ok(FilterType::Delta) } - _ => Err(Self::Error::LibTileDB(format!( - "Invalid filter type: {}", - value - ))), + _ => Err(Error::InvalidDiscriminant(value as u64)), } } } -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn filter_type_roundtrips() { - for i in 0..256 { - let maybe_ftype = FilterType::try_from(i); - if maybe_ftype.is_ok() { - let ftype = maybe_ftype.unwrap(); - let ftype_str = - ftype.to_string().expect("Error creating string."); - let str_ftype = FilterType::from_string(&ftype_str) - .expect("Error round tripping filter type string."); - assert_eq!(str_ftype, ftype); - } +impl From<&FilterData> for FilterType { + fn from(value: &FilterData) -> Self { + match value { + FilterData::None => FilterType::None, + FilterData::BitShuffle { .. } => FilterType::BitShuffle, + FilterData::ByteShuffle { .. } => FilterType::ByteShuffle, + FilterData::BitWidthReduction { .. } => { + FilterType::BitWidthReduction + } + FilterData::Checksum(ChecksumType::Md5) => FilterType::ChecksumMD5, + FilterData::Checksum(ChecksumType::Sha256) => { + FilterType::ChecksumSHA256 + } + FilterData::Compression(CompressionData { + kind: CompressionType::Bzip2, + .. + }) => FilterType::Bzip2, + FilterData::Compression(CompressionData { + kind: CompressionType::Delta { .. }, + .. + }) => FilterType::Delta, + FilterData::Compression(CompressionData { + kind: CompressionType::Dictionary, + .. + }) => FilterType::Dictionary, + FilterData::Compression(CompressionData { + kind: CompressionType::DoubleDelta { .. }, + .. + }) => FilterType::DoubleDelta, + FilterData::Compression(CompressionData { + kind: CompressionType::Gzip, + .. + }) => FilterType::Gzip, + FilterData::Compression(CompressionData { + kind: CompressionType::Lz4, + .. + }) => FilterType::Lz4, + FilterData::Compression(CompressionData { + kind: CompressionType::Rle, + .. 
+ }) => FilterType::Rle, + FilterData::Compression(CompressionData { + kind: CompressionType::Zstd, + .. + }) => FilterType::Zstd, + FilterData::PositiveDelta { .. } => FilterType::PositiveDelta, + FilterData::ScaleFloat { .. } => FilterType::ScaleFloat, + FilterData::WebP { .. } => FilterType::WebP, + FilterData::Xor => FilterType::Xor, } } } + +impl From for FilterType { + fn from(value: FilterData) -> Self { + FilterType::from(&value) + } +} diff --git a/tiledb/api/src/filter/list.rs b/tiledb/api/src/filter/list.rs index 682d2372..4a8fda03 100644 --- a/tiledb/api/src/filter/list.rs +++ b/tiledb/api/src/filter/list.rs @@ -2,9 +2,6 @@ use std::borrow::Borrow; use std::fmt::{Debug, Formatter, Result as FmtResult}; use std::ops::Deref; -use serde::{Deserialize, Serialize}; -use util::option::OptionSubset; - use crate::context::{CApiInterface, Context, ContextBound}; use crate::filter::{Filter, FilterData, RawFilter}; use crate::Result as TileDBResult; @@ -183,75 +180,6 @@ impl Builder { } } -#[derive( - Clone, Debug, Default, Deserialize, OptionSubset, PartialEq, Serialize, -)] -pub struct FilterListData(Vec); - -impl FilterListData { - pub fn into_inner(self) -> Vec { - self.0 - } -} - -impl Deref for FilterListData { - type Target = Vec; - - fn deref(&self) -> &Self::Target { - &self.0 - } -} - -impl From for FilterListData -where - V: Into>, -{ - fn from(value: V) -> Self { - Self(value.into()) - } -} - -impl FromIterator for FilterListData { - fn from_iter(iter: T) -> Self - where - T: IntoIterator, - { - FilterListData(iter.into_iter().collect::>()) - } -} - -impl TryFrom<&FilterList> for FilterListData { - type Error = crate::error::Error; - fn try_from(filters: &FilterList) -> TileDBResult { - filters - .to_vec()? 
- .into_iter() - .map(|f| FilterData::try_from(&f)) - .collect::>() - } -} - -impl TryFrom for FilterListData { - type Error = crate::error::Error; - - fn try_from(filters: FilterList) -> TileDBResult { - Self::try_from(&filters) - } -} - -impl crate::Factory for FilterListData { - type Item = FilterList; - - fn create(&self, context: &Context) -> TileDBResult { - Ok(self - .iter() - .fold(Builder::new(context), |b, filter| { - b?.add_filter_data(filter) - })? - .build()) - } -} - #[cfg(test)] mod test { use super::*; diff --git a/tiledb/api/src/filter/mod.rs b/tiledb/api/src/filter/mod.rs index 73893159..b8512640 100644 --- a/tiledb/api/src/filter/mod.rs +++ b/tiledb/api/src/filter/mod.rs @@ -1,292 +1,25 @@ +mod ftype; +pub mod list; +mod option; + use std::borrow::Borrow; use std::fmt::{Debug, Formatter, Result as FmtResult}; use std::ops::Deref; -use serde::{Deserialize, Serialize}; -use util::option::OptionSubset; - +use self::ftype::FilterType; +use self::option::FilterOption; use crate::context::{CApiInterface, Context, ContextBound}; -use crate::error::{DatatypeErrorKind, Error}; use crate::{Datatype, Result as TileDBResult}; -pub mod list; -pub mod webp; - -pub use crate::filter::list::{Builder as FilterListBuilder, FilterList}; -pub use crate::filter::webp::WebPFilterInputFormat; - -mod ftype; -mod option; - -use crate::filter::ftype::FilterType; -use crate::filter::option::FilterOption; - -#[derive( - Copy, Clone, Debug, Deserialize, Eq, OptionSubset, PartialEq, Serialize, -)] -pub enum CompressionType { - Bzip2, - Dictionary, - Gzip, - Lz4, - Rle, - Zstd, - Delta { - reinterpret_datatype: Option, - }, - DoubleDelta { - reinterpret_datatype: Option, - }, -} - -#[derive( - Copy, Clone, Debug, Deserialize, Eq, OptionSubset, PartialEq, Serialize, -)] -pub enum ChecksumType { - Md5, - Sha256, -} - -#[derive(Clone, Debug, Deserialize, OptionSubset, PartialEq, Serialize)] -pub struct CompressionData { - pub kind: CompressionType, - pub level: Option, -} - -impl 
CompressionData { - pub fn new(kind: CompressionType) -> Self { - CompressionData { kind, level: None } - } -} - -#[derive( - Clone, Copy, Debug, Default, Deserialize, OptionSubset, PartialEq, Serialize, -)] -pub enum ScaleFloatByteWidth { - I8, - I16, - I32, - #[default] // keep in sync with tiledb/sm/filter/float_scaling_filter.h - I64, -} - -impl ScaleFloatByteWidth { - pub(crate) fn capi_enum(&self) -> usize { - match *self { - Self::I8 => std::mem::size_of::(), - Self::I16 => std::mem::size_of::(), - Self::I32 => std::mem::size_of::(), - Self::I64 => std::mem::size_of::(), - } - } - - pub fn output_datatype(&self) -> Datatype { - match *self { - Self::I8 => Datatype::Int8, - Self::I16 => Datatype::Int16, - Self::I32 => Datatype::Int32, - Self::I64 => Datatype::Int64, - } - } -} - -impl TryFrom for ScaleFloatByteWidth { - type Error = crate::error::Error; - fn try_from(value: std::ffi::c_ulonglong) -> TileDBResult { - match value { - 1 => Ok(Self::I8), - 2 => Ok(Self::I16), - 4 => Ok(Self::I32), - 8 => Ok(Self::I64), - v => Err(Self::Error::LibTileDB(format!( - "Invalid scale float byte width: {}", - v - ))), - } - } -} - -#[derive(Clone, Debug, Deserialize, OptionSubset, PartialEq, Serialize)] -pub enum FilterData { - None, - BitShuffle, - ByteShuffle, - BitWidthReduction { - max_window: Option, - }, - Checksum(ChecksumType), - Compression(CompressionData), - PositiveDelta { - max_window: Option, - }, - ScaleFloat { - byte_width: Option, - factor: Option, - offset: Option, - }, - WebP { - input_format: WebPFilterInputFormat, - lossless: Option, - quality: Option, - }, - Xor, -} - -impl FilterData { - pub fn construct(&self, context: &Context) -> TileDBResult { - Filter::create(context, self) - } - - pub fn get_type(&self) -> FilterType { - match *self { - FilterData::None => FilterType::None, - FilterData::BitShuffle { .. } => FilterType::BitShuffle, - FilterData::ByteShuffle { .. } => FilterType::ByteShuffle, - FilterData::BitWidthReduction { .. 
} => { - FilterType::BitWidthReduction - } - FilterData::Checksum(ChecksumType::Md5) => FilterType::ChecksumMD5, - FilterData::Checksum(ChecksumType::Sha256) => { - FilterType::ChecksumSHA256 - } - FilterData::Compression(CompressionData { - kind: CompressionType::Bzip2, - .. - }) => FilterType::Bzip2, - FilterData::Compression(CompressionData { - kind: CompressionType::Delta { .. }, - .. - }) => FilterType::Delta, - FilterData::Compression(CompressionData { - kind: CompressionType::Dictionary, - .. - }) => FilterType::Dictionary, - FilterData::Compression(CompressionData { - kind: CompressionType::DoubleDelta { .. }, - .. - }) => FilterType::DoubleDelta, - FilterData::Compression(CompressionData { - kind: CompressionType::Gzip, - .. - }) => FilterType::Gzip, - FilterData::Compression(CompressionData { - kind: CompressionType::Lz4, - .. - }) => FilterType::Lz4, - FilterData::Compression(CompressionData { - kind: CompressionType::Rle, - .. - }) => FilterType::Rle, - FilterData::Compression(CompressionData { - kind: CompressionType::Zstd, - .. - }) => FilterType::Zstd, - FilterData::PositiveDelta { .. } => FilterType::PositiveDelta, - FilterData::ScaleFloat { .. } => FilterType::ScaleFloat, - FilterData::WebP { .. } => FilterType::WebP, - FilterData::Xor => FilterType::Xor, - } - } - - /// Returns the output datatype when this filter is applied to the input type. - /// If the filter cannot accept the requested input type, None is returned. - pub fn transform_datatype(&self, input: &Datatype) -> Option { - /* - * Note to developers, this code should be kept in sync with - * tiledb/sm/filters/filter/ functions - * - `accepts_input_datatype` - * - `output_datatype` - * - * Those functions are not part of the external C API. - */ - match *self { - FilterData::None => Some(*input), - FilterData::BitShuffle => Some(*input), - FilterData::ByteShuffle => Some(*input), - FilterData::Checksum(_) => Some(*input), - FilterData::BitWidthReduction { .. 
} - | FilterData::PositiveDelta { .. } => { - if input.is_integral_type() - || input.is_datetime_type() - || input.is_time_type() - || input.is_byte_type() - { - Some(*input) - } else { - None - } - } - FilterData::Compression(CompressionData { kind, .. }) => match kind - { - CompressionType::Delta { - reinterpret_datatype, - } - | CompressionType::DoubleDelta { - reinterpret_datatype, - } => reinterpret_datatype.map_or(Some(*input), |dtype| { - if !dtype.is_real_type() { - Some(dtype) - } else { - None - } - }), - _ => Some(*input), - }, - FilterData::ScaleFloat { byte_width, .. } => { - let input_size = input.size() as usize; - if input_size == std::mem::size_of::() - || input_size == std::mem::size_of::() - { - Some( - byte_width - .unwrap_or(ScaleFloatByteWidth::default()) - .output_datatype(), - ) - } else { - None - } - } - FilterData::WebP { .. } => { - if *input == Datatype::UInt8 { - Some(Datatype::UInt8) - } else { - None - } - } - FilterData::Xor => match input.size() { - 1 => Some(Datatype::Int8), - 2 => Some(Datatype::Int16), - 4 => Some(Datatype::Int32), - 8 => Some(Datatype::Int64), - _ => None, - }, - } - } -} - -impl TryFrom<&Filter> for FilterData { - type Error = crate::error::Error; +pub use self::ftype::Error as FilterTypeError; +pub use self::list::{Builder as FilterListBuilder, FilterList}; +pub use self::option::Error as FilterOptionError; - fn try_from(filter: &Filter) -> TileDBResult { - filter.filter_data() - } -} - -impl TryFrom for FilterData { - type Error = crate::error::Error; - - fn try_from(filter: Filter) -> TileDBResult { - Self::try_from(&filter) - } -} - -impl crate::Factory for FilterData { - type Item = Filter; - - fn create(&self, context: &Context) -> TileDBResult { - Filter::create(context, self) - } -} +pub use tiledb_common::filter::{ + ChecksumType, CompressionData, CompressionType, FilterData, + ScaleFloatByteWidth, ScaleFloatByteWidthError, WebPFilterError, + WebPFilterInputFormat, +}; pub(crate) enum RawFilter { 
Owned(*mut ffi::tiledb_filter_t), @@ -337,7 +70,8 @@ impl Filter { { let filter_data = filter_data.borrow(); let mut c_filter: *mut ffi::tiledb_filter_t = out_ptr!(); - let ftype = filter_data.get_type().capi_enum(); + let ftype = + ffi::tiledb_filter_type_t::from(FilterType::from(filter_data)); context.capi_call(|ctx| unsafe { ffi::tiledb_filter_alloc(ctx, ftype, &mut c_filter) })?; @@ -380,8 +114,9 @@ impl Filter { | CompressionType::DoubleDelta { reinterpret_datatype: Some(reinterpret_datatype), } => { - let c_datatype = reinterpret_datatype.capi_enum() - as std::ffi::c_uchar; + let c_datatype = + ffi::tiledb_datatype_t::from(reinterpret_datatype) + as std::ffi::c_uchar; Self::set_option( context, *raw, @@ -409,7 +144,7 @@ impl Filter { offset, } => { if let Some(byte_width) = byte_width { - let c_width = byte_width.capi_enum(); + let c_width = std::ffi::c_ulonglong::from(byte_width); Self::set_option( context, *raw, @@ -445,7 +180,8 @@ impl Filter { } => { { let c_format = - input_format.capi_enum() as std::ffi::c_uchar; + ffi::tiledb_filter_webp_format_t::from(input_format) + as std::ffi::c_uchar; Self::set_option( context, *raw, @@ -512,14 +248,7 @@ impl Filter { let dtype = self.get_option::( FilterOption::CompressionReinterpretDatatype, )?; - Datatype::try_from(dtype as ffi::tiledb_datatype_t) - .map_err(|_| { - Error::Datatype( - DatatypeErrorKind::InvalidDiscriminant( - dtype as u64, - ), - ) - })? + Datatype::try_from(dtype as ffi::tiledb_datatype_t)? 
}); if FilterType::try_from(c_ftype).unwrap() == FilterType::Delta { get_compression_data(CompressionType::Delta { @@ -585,7 +314,7 @@ impl Filter { fn get_option(&self, fopt: FilterOption) -> TileDBResult { let c_filter = self.capi(); - let c_opt = fopt.capi_enum(); + let c_opt = ffi::tiledb_filter_option_t::from(fopt); let mut val: T = out_ptr!(); self.capi_call(|ctx| unsafe { ffi::tiledb_filter_get_option( @@ -604,7 +333,7 @@ impl Filter { fopt: FilterOption, val: T, ) -> TileDBResult<()> { - let c_opt = fopt.capi_enum(); + let c_opt = ffi::tiledb_filter_option_t::from(fopt); let c_val = &val as *const T as *const std::ffi::c_void; context.capi_call(|ctx| unsafe { ffi::tiledb_filter_set_option(ctx, raw, c_opt, c_val) @@ -634,12 +363,22 @@ impl PartialEq for Filter { #[cfg(feature = "arrow")] pub mod arrow; -#[cfg(any(test, feature = "proptest-strategies"))] -pub mod strategy; +#[cfg(any(test, feature = "pod"))] +pub mod pod; #[cfg(test)] mod tests { + use std::rc::Rc; + + use proptest::prelude::*; + use tiledb_common::filter::FilterData; + use tiledb_pod::filter::strategy::{ + prop_filter, FilterPipelineStrategy, Requirements, + }; + use utils::assert_option_subset; + use super::*; + use crate::Factory; /// Ensure that we can construct a filter from all options using default settings #[test] @@ -853,4 +592,98 @@ mod tests { _ => unreachable!(), } } + + #[test] + /// Test that the arbitrary filter construction always succeeds + fn filter_arbitrary() { + let ctx = Context::new().expect("Error creating context"); + + proptest!(|(filt in FilterPipelineStrategy::default())| { + filt.create(&ctx).expect("Error constructing arbitrary filter"); + }); + } + + /// Test that the arbitrary filter construction always succeeds with a + /// supplied datatype + #[test] + fn filter_arbitrary_for_datatype() { + let ctx = Context::new().expect("Error creating context"); + + let strat = any::().prop_flat_map(|dt| { + ( + Just(dt), + prop_filter(Rc::new(Requirements { + 
input_datatype: Some(dt), + ..Default::default() + })), + ) + }); + + proptest!(|((dt, filt) in strat)| { + let filt = filt.create(&ctx) + .expect("Error constructing arbitrary filter"); + + let filt_data = filt.filter_data() + .expect("Error reading filter data"); + assert!(filt_data.transform_datatype(&dt).is_some()); + }); + } + + #[test] + /// Test that the arbitrary filter list construction always succeeds + fn filter_list_arbitrary() { + let ctx = Context::new().expect("Error creating context"); + + proptest!(|(fl in FilterPipelineStrategy::default())| { + fl.create(&ctx).expect("Error constructing arbitrary filter list"); + }); + } + + #[test] + /// Test that the arbitrary filter list construction always succeeds with a + /// supplied datatype + fn filter_list_arbitrary_for_datatype() { + let ctx = Context::new().expect("Error creating context"); + + let strat = any::().prop_flat_map(|dt| { + let req = Rc::new(Requirements { + input_datatype: Some(dt), + ..Default::default() + }); + (Just(dt), FilterPipelineStrategy::new(req)) + }); + + proptest!(|((dt, fl) in strat)| { + let fl = fl.create(&ctx) + .expect("Error constructing arbitrary filter"); + + let mut current_dt = dt; + + let fl = fl.to_vec().expect("Error collecting filters"); + for (fi, f) in fl.iter().enumerate() { + if let Some(next_dt) = f.filter_data() + .expect("Error reading filter data") + .transform_datatype(¤t_dt) { + current_dt = next_dt + } else { + panic!("Constructed invalid filter list for datatype {}: \ + {:?}, invalid at position {}", dt, fl, fi) + } + } + }); + } + + #[test] + fn filter_eq_reflexivity() { + let ctx = Context::new().expect("Error creating context"); + + proptest!(|(pipeline in FilterPipelineStrategy::default())| { + assert_eq!(pipeline, pipeline); + assert_option_subset!(pipeline, pipeline); + + let pipeline = pipeline.create(&ctx) + .expect("Error constructing arbitrary filter"); + assert_eq!(pipeline, pipeline); + }); + } } diff --git 
a/tiledb/api/src/filter/option.rs b/tiledb/api/src/filter/option.rs index b2439c98..43e2b0c0 100644 --- a/tiledb/api/src/filter/option.rs +++ b/tiledb/api/src/filter/option.rs @@ -1,8 +1,16 @@ -use crate::Result as TileDBResult; +use thiserror::Error; +#[cfg(feature = "serde")] use serde::{Deserialize, Serialize}; -#[derive(Clone, Debug, PartialEq, Deserialize, Serialize)] +#[derive(Clone, Debug, Error)] +pub enum Error { + #[error("Invalid discriminant for {}: {0}", std::any::type_name::())] + InvalidDiscriminant(u64), +} + +#[derive(Copy, Clone, Debug, Eq, PartialEq)] +#[cfg_attr(feature = "serde", derive(Deserialize, Serialize))] pub enum FilterOption { CompressionLevel, BitWidthMaxWindow, @@ -16,9 +24,9 @@ pub enum FilterOption { CompressionReinterpretDatatype, } -impl FilterOption { - pub(crate) fn capi_enum(&self) -> ffi::tiledb_filter_option_t { - let ffi_enum = match *self { +impl From for ffi::tiledb_filter_option_t { + fn from(value: FilterOption) -> Self { + let ffi_enum = match value { FilterOption::CompressionLevel => { ffi::tiledb_filter_option_t_TILEDB_COMPRESSION_LEVEL }, @@ -55,8 +63,8 @@ impl FilterOption { } impl TryFrom for FilterOption { - type Error = crate::error::Error; - fn try_from(value: u32) -> TileDBResult { + type Error = Error; + fn try_from(value: u32) -> Result { match value { ffi::tiledb_filter_option_t_TILEDB_COMPRESSION_LEVEL => { Ok(FilterOption::CompressionLevel) @@ -74,10 +82,7 @@ impl TryFrom for FilterOption { ffi::tiledb_filter_option_t_TILEDB_WEBP_INPUT_FORMAT => Ok(FilterOption::WebPInputFormat), ffi::tiledb_filter_option_t_TILEDB_WEBP_LOSSLESS => Ok(FilterOption::WebPLossless), ffi::tiledb_filter_option_t_TILEDB_COMPRESSION_REINTERPRET_DATATYPE => Ok(FilterOption::CompressionReinterpretDatatype), - _ => Err(Self::Error::LibTileDB(format!( - "Invalid filter option type: {}", - value - ))), + _ => Err(Error::InvalidDiscriminant(value as u64)) } } } @@ -91,8 +96,9 @@ mod tests { let mut ok = 0; for i in 0..256 { let fopt = 
FilterOption::try_from(i); - if fopt.is_ok() { + if let Ok(fopt) = fopt { ok += 1; + assert_eq!(i, ffi::tiledb_filter_option_t::from(fopt)); } } assert_eq!(ok, 10); diff --git a/tiledb/api/src/filter/pod.rs b/tiledb/api/src/filter/pod.rs new file mode 100644 index 00000000..bc11af8c --- /dev/null +++ b/tiledb/api/src/filter/pod.rs @@ -0,0 +1,62 @@ +use tiledb_common::filter::FilterData; + +use super::{Filter, FilterList, FilterListBuilder}; +use crate::error::Error as TileDBError; +use crate::{Context, Result as TileDBResult}; + +impl TryFrom<&Filter> for FilterData { + type Error = TileDBError; + + fn try_from(filter: &Filter) -> Result { + filter.filter_data() + } +} + +impl TryFrom for FilterData { + type Error = TileDBError; + + fn try_from(filter: Filter) -> Result { + filter.filter_data() + } +} + +impl crate::Factory for FilterData { + type Item = Filter; + + fn create(&self, context: &Context) -> TileDBResult { + Filter::create(context, self) + } +} + +impl TryFrom<&FilterList> for Vec { + type Error = crate::error::Error; + + fn try_from(pipeline: &FilterList) -> Result { + pipeline + .to_vec()? + .into_iter() + .map(FilterData::try_from) + .collect::>() + } +} + +impl TryFrom for Vec { + type Error = TileDBError; + + fn try_from(pipeline: FilterList) -> Result { + Self::try_from(&pipeline) + } +} + +impl crate::Factory for Vec { + type Item = FilterList; + + fn create(&self, context: &Context) -> TileDBResult { + Ok(self + .iter() + .fold(FilterListBuilder::new(context), |b, filter| { + b?.add_filter_data(filter) + })? 
+ .build()) + } +} diff --git a/tiledb/api/src/group/mod.rs b/tiledb/api/src/group/mod.rs index d9f0b372..89e7ad1f 100644 --- a/tiledb/api/src/group/mod.rs +++ b/tiledb/api/src/group/mod.rs @@ -4,6 +4,7 @@ use crate::config::{Config, RawConfig}; use crate::context::{CApiInterface, ContextBound, ObjectType}; use crate::error::Error; use crate::key::LookupKey; +use crate::metadata; use crate::{Context, Datatype}; extern crate tiledb_sys as ffi; @@ -100,7 +101,7 @@ impl Group { } let raw_group = RawGroup::new(group_raw); - let query_type_raw = query_type.capi_enum(); + let query_type_raw = ffi::tiledb_query_type_t::from(query_type); context.capi_call(|ctx| unsafe { ffi::tiledb_group_open(ctx, group_raw, query_type_raw) })?; @@ -135,7 +136,7 @@ impl Group { self.capi_call(|ctx| unsafe { ffi::tiledb_group_get_query_type(ctx, c_group, &mut c_type) })?; - QueryType::try_from(c_type) + Ok(QueryType::try_from(c_type)?) } // Deletes the group itself. Can only be called once. @@ -328,7 +329,8 @@ impl Group { pub fn put_metadata(&mut self, metadata: Metadata) -> TileDBResult<()> { let c_group = self.capi(); - let (vec_size, vec_ptr, datatype) = metadata.c_data(); + let (vec_size, vec_ptr, datatype) = + metadata::metadata_to_ffi(&metadata); let c_key = cstring!(metadata.key); self.capi_call(|ctx| unsafe { ffi::tiledb_group_put_metadata( @@ -336,7 +338,7 @@ impl Group { c_group, c_key.as_ptr(), datatype, - vec_size as u32, + vec_size, vec_ptr, ) })?; @@ -408,7 +410,11 @@ impl Group { } }?; let datatype = Datatype::try_from(c_datatype)?; - Ok(Metadata::new_raw(name, datatype, vec_ptr, vec_size)) + Ok(metadata::metadata_from_ffi( + name, + datatype, + (vec_size, vec_ptr), + )) } pub fn has_metadata_key(&self, name: S) -> TileDBResult> @@ -513,11 +519,11 @@ mod tests { context::ObjectType, Result as TileDBResult, }; - use tiledb_test_utils::{self, TestArrayUri}; + use uri::{self, TestArrayUri}; #[test] fn test_group_metadata() -> TileDBResult<()> { - let test_uri = 
tiledb_test_utils::get_uri_generator() + let test_uri = uri::get_uri_generator() .map_err(|e| Error::Other(e.to_string()))?; let tdb = Context::new()?; @@ -652,7 +658,7 @@ mod tests { #[test] fn test_group_functionality() -> TileDBResult<()> { - let test_uri = tiledb_test_utils::get_uri_generator() + let test_uri = uri::get_uri_generator() .map_err(|e| Error::Other(e.to_string()))?; let tdb = Context::new()?; @@ -732,7 +738,7 @@ mod tests { #[test] fn test_group_config() -> TileDBResult<()> { - let test_uri = tiledb_test_utils::get_uri_generator() + let test_uri = uri::get_uri_generator() .map_err(|e| Error::Other(e.to_string()))?; let tdb = Context::new()?; diff --git a/tiledb/api/src/lib.rs b/tiledb/api/src/lib.rs index 518388f0..3c960638 100644 --- a/tiledb/api/src/lib.rs +++ b/tiledb/api/src/lib.rs @@ -1,11 +1,9 @@ extern crate anyhow; -extern crate serde; -extern crate serde_json; extern crate thiserror; -#[macro_use] -extern crate tiledb_proc_macro; extern crate tiledb_sys as ffi; -extern crate tiledb_utils as util; + +#[cfg(test)] +extern crate tiledb_utils as utils; macro_rules! cstring { ($arg:expr) => { @@ -40,6 +38,10 @@ macro_rules! out_ptr { }; } +pub use tiledb_common::key; +pub use tiledb_common::physical_type_go; +pub use tiledb_common::range; + pub mod array; pub mod config; pub mod context; @@ -48,11 +50,8 @@ pub mod error; pub mod filesystem; pub mod filter; pub mod group; -pub mod key; pub mod metadata; pub mod query; -#[macro_use] -pub mod range; pub mod stats; pub mod string; pub mod vfs; @@ -80,32 +79,9 @@ pub use context::{Context, ContextBound}; pub use datatype::Datatype; pub type Result = std::result::Result; +#[cfg(any(test, feature = "pod"))] pub trait Factory { type Item; fn create(&self, context: &context::Context) -> Result; } - -mod private { - // The "sealed trait" pattern is a way to prevent downstream crates from implementing traits - // that you don't think they should implement. 
If you have `trait Foo: Sealed`, then - // downstream crates cannot `impl Foo` because they cannot `impl Sealed`. - // - // Semantic versioning is one reason you might want this. - // We currently use this as a bound for `datatype::PhysicalType` and `datatype::LogicalType` - // so that we won't accept something that we don't know about for the C API calls. - pub trait Sealed {} - - macro_rules! sealed { - ($($DT:ty),+) => { - $( - impl crate::private::Sealed for $DT {} - )+ - } - } - - pub(crate) use sealed; -} - -#[cfg(any(test, feature = "proptest-strategies"))] -pub mod strategy; diff --git a/tiledb/api/src/metadata.rs b/tiledb/api/src/metadata.rs index 5993874e..d6e7e6e3 100644 --- a/tiledb/api/src/metadata.rs +++ b/tiledb/api/src/metadata.rs @@ -1,363 +1,44 @@ -use crate::datatype::Datatype; -use crate::error::DatatypeErrorKind; -use crate::physical_type_go; -use crate::Result as TileDBResult; -use core::slice; -use std::convert::From; - -use serde::{Deserialize, Serialize}; -use util::option::OptionSubset; - -#[derive(Clone, Debug, Deserialize, OptionSubset, PartialEq, Serialize)] -pub enum Value { - UInt8Value(Vec), - UInt16Value(Vec), - UInt32Value(Vec), - UInt64Value(Vec), - Int8Value(Vec), - Int16Value(Vec), - Int32Value(Vec), - Int64Value(Vec), - Float32Value(Vec), - Float64Value(Vec), -} - -fn get_value_vec(vec: &[T]) -> (*const std::ffi::c_void, usize) { - let vec_size = vec.len(); - let vec_ptr = vec.as_ptr() as *const std::ffi::c_void; - (vec_ptr, vec_size) -} - -/// Applies a generic expression to the interior of a `Value`. -/// -/// # Examples -/// ``` -/// use tiledb::metadata::Value; -/// use tiledb::value_go; -/// -/// fn truncate(v: &mut Value, len: usize) { -/// value_go!(v, _DT, ref mut v_inner, v_inner.truncate(len)); -/// } -/// -/// let mut v = Value::UInt64Value(vec![0, 24, 48]); -/// truncate(&mut v, 2); -/// assert_eq!(v, Value::UInt64Value(vec![0, 24])); -/// ``` -#[macro_export] -macro_rules! 
value_go { - ($valuetype:expr, $typename:ident, $vec:pat, $then: expr) => {{ - use $crate::metadata::Value; - match $valuetype { - Value::Int8Value($vec) => { - type $typename = i8; - $then - } - Value::Int16Value($vec) => { - type $typename = i16; - $then - } - Value::Int32Value($vec) => { - type $typename = i32; - $then - } - Value::Int64Value($vec) => { - type $typename = i64; - $then - } - Value::UInt8Value($vec) => { - type $typename = u8; - $then - } - Value::UInt16Value($vec) => { - type $typename = u16; - $then - } - Value::UInt32Value($vec) => { - type $typename = u32; - $then - } - Value::UInt64Value($vec) => { - type $typename = u64; - $then - } - Value::Float32Value($vec) => { - type $typename = f32; - $then - } - Value::Float64Value($vec) => { - type $typename = f64; - $then - } - } - }}; -} -pub use value_go; - -/// Applies a generic expression to the interiors of two `Value`s with matching variants, -/// i.e. with the same physical data type. Typical usage is for comparing the insides of the two -/// `Value`s. -#[macro_export] -macro_rules! 
value_cmp { - ($lexpr:expr, $rexpr:expr, $typename:ident, $lpat:pat, $rpat:pat, $same_type:expr, $else:expr) => {{ - use $crate::metadata::Value; - match ($lexpr, $rexpr) { - (Value::Int8Value($lpat), Value::Int8Value($rpat)) => { - type $typename = i8; - $same_type - } - (Value::Int16Value($lpat), Value::Int16Value($rpat)) => { - type $typename = i16; - $same_type - } - (Value::Int32Value($lpat), Value::Int32Value($rpat)) => { - type $typename = i32; - $same_type - } - (Value::Int64Value($lpat), Value::Int64Value($rpat)) => { - type $typename = i64; - $same_type - } - (Value::UInt8Value($lpat), Value::UInt8Value($rpat)) => { - type $typename = u8; - $same_type - } - (Value::UInt16Value($lpat), Value::UInt16Value($rpat)) => { - type $typename = u16; - $same_type - } - (Value::UInt32Value($lpat), Value::UInt32Value($rpat)) => { - type $typename = u32; - $same_type - } - (Value::UInt64Value($lpat), Value::UInt64Value($rpat)) => { - type $typename = u64; - $same_type - } - (Value::Float32Value($lpat), Value::Float32Value($rpat)) => { - type $typename = f32; - $same_type - } - (Value::Float64Value($lpat), Value::Float64Value($rpat)) => { - type $typename = f64; - $same_type - } - _ => $else, - } - }}; -} - -impl Value { - pub(crate) fn c_vec(&self) -> (*const std::ffi::c_void, usize) { - value_go!(self, _DT, ref vec, get_value_vec(vec)) - } - - pub fn len(&self) -> usize { - value_go!(self, _DT, ref v, v.len()) - } - - pub fn is_empty(&self) -> bool { - value_go!(self, _DT, ref v, v.is_empty()) - } -} - -macro_rules! 
value_impl { - ($ty:ty, $constructor:expr) => { - impl From> for Value { - fn from(vec: Vec<$ty>) -> Self { - $constructor(vec) - } - } - }; -} - -value_impl!(i8, Value::Int8Value); -value_impl!(i16, Value::Int16Value); -value_impl!(i32, Value::Int32Value); -value_impl!(i64, Value::Int64Value); -value_impl!(u8, Value::UInt8Value); -value_impl!(u16, Value::UInt16Value); -value_impl!(u32, Value::UInt32Value); -value_impl!(u64, Value::UInt64Value); -value_impl!(f32, Value::Float32Value); -value_impl!(f64, Value::Float64Value); - -#[derive(Clone, Debug, PartialEq)] -pub struct Metadata { - pub key: String, - pub datatype: Datatype, - pub value: Value, -} - -impl Metadata { - pub fn new( - key: String, - datatype: Datatype, - vec: Vec, - ) -> TileDBResult - where - Value: From>, - T: 'static, - { - if !datatype.is_compatible_type::() { - return Err(crate::error::Error::Datatype( - DatatypeErrorKind::TypeMismatch { - user_type: std::any::type_name::().to_owned(), - tiledb_type: datatype, - }, - )); - } - Ok(Metadata { - key, - datatype, - value: Value::from(vec), - }) - } - - pub(crate) fn new_raw( - key: String, - datatype: Datatype, - vec_ptr: *const std::ffi::c_void, - vec_size: u32, - ) -> Self { - let value = physical_type_go!(datatype, DT, { - let vec_slice = unsafe { - slice::from_raw_parts( - vec_ptr as *const DT, - vec_size.try_into().unwrap(), - ) - }; - let vec_value: Vec
= vec_slice.to_vec(); - Value::from(vec_value) +use tiledb_common::datatype::Datatype; +use tiledb_common::physical_type_go; + +pub use tiledb_common::metadata::*; +pub use tiledb_common::metadata_value_go; + +pub(crate) fn metadata_to_ffi( + metadata: &Metadata, +) -> (u32, *const std::ffi::c_void, ffi::tiledb_datatype_t) { + let (vec_size, vec_ptr) = + metadata_value_go!(metadata.value, _DT, ref contents, { + (contents.len(), contents.as_ptr() as *const std::ffi::c_void) }); - Metadata { - key, - datatype, - value, - } - } - - pub(crate) fn c_data( - &self, - ) -> (usize, *const std::ffi::c_void, ffi::tiledb_datatype_t) { - let (vec_ptr, vec_size) = self.value.c_vec(); - let c_datatype = self.datatype.capi_enum(); - (vec_size, vec_ptr, c_datatype) - } -} - -#[cfg(any(test, feature = "proptest-strategies"))] -pub mod strategy { - use super::*; - use proptest::collection::{vec, SizeRange}; - use proptest::prelude::*; - - use crate::datatype::strategy::DatatypeContext; - - pub struct Requirements { - key: BoxedStrategy, - datatype: BoxedStrategy, - value_length: SizeRange, - } - - impl Requirements { - const DEFAULT_VALUE_LENGTH_MIN: usize = 1; // SC-48955 - const DEFAULT_VALUE_LENGTH_MAX: usize = 64; - } - - impl Default for Requirements { - fn default() -> Self { - Requirements { - key: any::().boxed(), - datatype: any_with::(DatatypeContext::NotAny).boxed(), - value_length: (Self::DEFAULT_VALUE_LENGTH_MIN - ..=Self::DEFAULT_VALUE_LENGTH_MAX) - .into(), + let c_datatype = ffi::tiledb_datatype_t::from(metadata.datatype); + (vec_size as u32, vec_ptr, c_datatype) +} + +pub(crate) fn metadata_from_ffi( + key: String, + datatype: Datatype, + ffi: (u32, *const std::ffi::c_void), +) -> Metadata { + let value = physical_type_go!(datatype, DT, { + let slice = { + let vec_ptr = if ffi.0 == 0 { + std::ptr::NonNull::
::dangling().as_ptr() + as *const std::ffi::c_void + } else { + ffi.1 + }; + unsafe { + std::slice::from_raw_parts(vec_ptr as *const DT, ffi.0 as usize) } - } - } - - impl Arbitrary for Metadata { - type Parameters = Requirements; - type Strategy = BoxedStrategy; - - fn arbitrary_with(params: Self::Parameters) -> Self::Strategy { - params - .datatype - .prop_flat_map(move |dt| { - let value_strat = physical_type_go!(dt, DT, { - vec(any::
(), params.value_length.clone()) - .prop_map(Value::from) - .boxed() - }); - (params.key.clone(), Just(dt), value_strat) - }) - .prop_map(|(key, datatype, value)| Metadata { - key, - datatype, - value, - }) - .boxed() - } - } - - #[cfg(test)] - mod tests { - use tiledb_test_utils::TestArrayUri; - - use super::*; - use crate::array::{Array, Mode}; - use crate::Context; - - #[test] - fn arbitrary_put() { - let test_uri = tiledb_test_utils::get_uri_generator().unwrap(); - let uri = test_uri.with_path("quickstart_dense").unwrap(); - - let c: Context = Context::new().unwrap(); - crate::array::tests::create_quickstart_dense(&test_uri, &c) - .unwrap(); - - proptest!(move |(m_in in any::())| { - // write - { - let mut a = Array::open(&c, &uri, Mode::Write).unwrap(); - a.put_metadata(m_in.clone()).expect("Error writing metadata"); - } - // read - { - let a = Array::open(&c, &uri, Mode::Read).unwrap(); - let m_out = a.metadata(m_in.key.clone()).expect("Error reading metadata"); - assert_eq!(m_in, m_out); - } - }); - } - } -} - -#[cfg(test)] -mod tests { - use super::*; - use proptest::prelude::*; - - fn do_value_cmp(m1: Metadata, m2: Metadata) { - if m1.datatype.same_physical_type(&m2.datatype) { - value_cmp!(&m1.value, &m2.value, _DT, _, _, - (), - unreachable!("Non-matching `Value` variants for same physical type: {:?} and {:?}", - m1, m2)); - } else { - value_cmp!(&m1.value, &m2.value, _DT, _, _, - unreachable!("Matching `Value` variants for different physical type: {:?} and {:?}", - m1, m2), - ()); - } - } + }; + Value::from(slice.to_vec()) + }); - proptest! 
{ - #[test] - fn value_cmp((m1, m2) in (any::(), any::())) { - do_value_cmp(m1, m2) - } + Metadata { + key, + datatype, + value, } } diff --git a/tiledb/api/src/query/buffer/mod.rs b/tiledb/api/src/query/buffer/mod.rs index ec83167c..2426fc5b 100644 --- a/tiledb/api/src/query/buffer/mod.rs +++ b/tiledb/api/src/query/buffer/mod.rs @@ -493,6 +493,7 @@ query_buffers_proof_impls!( QueryBuffersCellStructureVar ); +#[derive(Debug)] pub enum TypedQueryBuffers<'data> { UInt8(QueryBuffers<'data, u8>), UInt16(QueryBuffers<'data, u16>), @@ -754,8 +755,8 @@ impl<'cell> RefTypedQueryBuffersMut<'cell, '_> { } } -#[cfg(any(test, feature = "proptest-strategies"))] -pub mod strategy { +#[cfg(test)] +pub mod tests { use proptest::collection::vec; use proptest::prelude::*; diff --git a/tiledb/api/src/query/conditions.rs b/tiledb/api/src/query/conditions.rs index 0b22c528..9b66252c 100644 --- a/tiledb/api/src/query/conditions.rs +++ b/tiledb/api/src/query/conditions.rs @@ -3,6 +3,8 @@ use std::hash::{Hash, Hasher}; use std::ops::{BitAnd, BitOr, Deref, Not}; use anyhow::anyhow; + +#[cfg(feature = "serde")] use serde::{Deserialize, Serialize}; use crate::context::Context; @@ -10,7 +12,8 @@ use crate::datatype::physical::{BitsEq, BitsHash}; use crate::error::Error; use crate::Result as TileDBResult; -#[derive(Clone, Copy, Debug, Deserialize, Eq, Hash, PartialEq, Serialize)] +#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)] +#[cfg_attr(feature = "serde", derive(Deserialize, Serialize))] pub enum EqualityOp { Less, LessEqual, @@ -46,7 +49,8 @@ impl Display for EqualityOp { } } -#[derive(Clone, Copy, Debug, Deserialize, Eq, Hash, PartialEq, Serialize)] +#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)] +#[cfg_attr(feature = "serde", derive(Deserialize, Serialize))] pub enum SetMembershipOp { In, NotIn, @@ -70,7 +74,8 @@ impl Display for SetMembershipOp { } } -#[derive(Clone, Copy, Debug, Deserialize, Eq, Hash, PartialEq, Serialize)] +#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)] 
+#[cfg_attr(feature = "serde", derive(Deserialize, Serialize))] pub enum NullnessOp { IsNull, NotNull, @@ -94,7 +99,8 @@ impl Display for NullnessOp { } } -#[derive(Clone, Copy, Debug, Deserialize, Eq, Hash, PartialEq, Serialize)] +#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)] +#[cfg_attr(feature = "serde", derive(Deserialize, Serialize))] pub enum CombinationOp { And, Or, @@ -122,7 +128,8 @@ impl Display for CombinationOp { } } -#[derive(Clone, Debug, Deserialize, Serialize)] +#[derive(Clone, Debug)] +#[cfg_attr(feature = "serde", derive(Deserialize, Serialize))] pub enum Literal { UInt8(u8), UInt16(u16), @@ -257,7 +264,8 @@ fn escape_string_literal(s: &str) -> impl Display + '_ { } // N.B. I initially tried slices here, but that breaks the Deserialize trait. -#[derive(Clone, Debug, Deserialize, Serialize)] +#[derive(Clone, Debug)] +#[cfg_attr(feature = "serde", derive(Deserialize, Serialize))] pub enum SetMembers { UInt8(Vec), UInt16(Vec), @@ -454,7 +462,8 @@ impl From<&[&str]> for SetMembers { } } -#[derive(Clone, Debug, Deserialize, Eq, Hash, PartialEq, Serialize)] +#[derive(Clone, Debug, Eq, Hash, PartialEq)] +#[cfg_attr(feature = "serde", derive(Deserialize, Serialize))] pub struct EqualityPredicate { field: String, op: EqualityOp, @@ -497,7 +506,8 @@ impl Display for EqualityPredicate { } } -#[derive(Clone, Debug, Deserialize, Eq, Hash, PartialEq, Serialize)] +#[derive(Clone, Debug, Eq, Hash, PartialEq)] +#[cfg_attr(feature = "serde", derive(Deserialize, Serialize))] pub struct SetMembershipPredicate { field: String, op: SetMembershipOp, @@ -608,7 +618,8 @@ impl Display for SetMembershipPredicate { } } -#[derive(Clone, Debug, Deserialize, Eq, Hash, PartialEq, Serialize)] +#[derive(Clone, Debug, Eq, Hash, PartialEq)] +#[cfg_attr(feature = "serde", derive(Deserialize, Serialize))] pub struct NullnessPredicate { field: String, op: NullnessOp, @@ -647,7 +658,8 @@ impl Display for NullnessPredicate { } } -#[derive(Clone, Debug, Deserialize, Eq, Hash, 
PartialEq, Serialize)] +#[derive(Clone, Debug, Eq, Hash, PartialEq)] +#[cfg_attr(feature = "serde", derive(Deserialize, Serialize))] pub enum Predicate { Equality(EqualityPredicate), SetMembership(SetMembershipPredicate), @@ -674,7 +686,8 @@ impl Display for Predicate { } } -#[derive(Clone, Debug, Deserialize, Eq, Hash, PartialEq, Serialize)] +#[derive(Clone, Debug, Eq, Hash, PartialEq)] +#[cfg_attr(feature = "serde", derive(Deserialize, Serialize))] pub struct Field { field: String, } @@ -763,7 +776,8 @@ impl Field { } } -#[derive(Clone, Debug, Deserialize, Eq, Hash, PartialEq, Serialize)] +#[derive(Clone, Debug, Eq, Hash, PartialEq)] +#[cfg_attr(feature = "serde", derive(Deserialize, Serialize))] pub enum QueryConditionExpr { Cond(Predicate), Comb { diff --git a/tiledb/api/src/query/mod.rs b/tiledb/api/src/query/mod.rs index f8cb3e39..94e2d519 100644 --- a/tiledb/api/src/query/mod.rs +++ b/tiledb/api/src/query/mod.rs @@ -10,9 +10,6 @@ pub mod read; pub mod subarray; pub mod write; -#[cfg(any(test, feature = "proptest-strategies"))] -pub mod strategy; - pub use self::conditions::QueryConditionExpr; pub use self::read::{ ReadBuilder, ReadQuery, ReadQueryBuilder, ReadStepOutput, TypedReadBuilder, @@ -25,6 +22,24 @@ use self::subarray::RawSubarray; pub type QueryType = crate::array::Mode; pub type QueryLayout = crate::array::CellOrder; +// TODO: this is basically just to patch things over +// to prevent conflicting impl errors (because PhysicalType +// comes from tiledb_common but the traits are defined in this crate) +// we will also split the query adapter stuff out of this crate +// but that will be more complicated +pub trait CellValue: tiledb_common::datatype::PhysicalType {} + +impl CellValue for u8 {} +impl CellValue for u16 {} +impl CellValue for u32 {} +impl CellValue for u64 {} +impl CellValue for i8 {} +impl CellValue for i16 {} +impl CellValue for i32 {} +impl CellValue for i64 {} +impl CellValue for f32 {} +impl CellValue for f64 {} + pub enum RawQuery { 
Owned(*mut ffi::tiledb_query_t), } @@ -56,7 +71,7 @@ pub trait Query { /// The Subarray is tied to the lifetime of the Query. /// /// ```compile_fail,E0505 - /// # use tiledb::query::{Query, QueryBase, Subarray}; + /// # use tiledb_api::query::{Query, QueryBase, Subarray}; /// fn invalid_use(query: QueryBase) { /// let subarray = query.subarray().unwrap(); /// drop(query); @@ -187,7 +202,7 @@ pub trait QueryBuilder: Sized { Self: Sized, { let c_query = **self.base().cquery(); - let c_layout = layout.capi_enum(); + let c_layout = ffi::tiledb_layout_t::from(layout); self.base().capi_call(|ctx| unsafe { ffi::tiledb_query_set_layout(ctx, c_query, c_layout) })?; @@ -269,7 +284,7 @@ impl QueryBuilder for BuilderBase { impl BuilderBase { fn new(array: Array, query_type: QueryType) -> TileDBResult { let c_array = **array.capi(); - let c_query_type = query_type.capi_enum(); + let c_query_type = ffi::tiledb_query_type_t::from(query_type); let mut c_query: *mut ffi::tiledb_query_t = out_ptr!(); array.capi_call(|ctx| unsafe { ffi::tiledb_query_alloc(ctx, c_array, c_query_type, &mut c_query) @@ -283,3 +298,29 @@ impl BuilderBase { }) } } + +pub trait ToReadQuery { + type ReadBuilder<'data, B> + where + Self: 'data; + + /// Prepares a read query to read the fields written by this operation + /// restricted to the subarray represented by this write. + fn attach_read<'data, B>( + &'data self, + b: B, + ) -> TileDBResult> + where + B: ReadQueryBuilder<'data>; +} + +pub trait ToWriteQuery { + /// Prepares a write query to insert data from this write. 
+ fn attach_write<'data>( + &'data self, + b: WriteBuilder<'data>, + ) -> TileDBResult>; +} + +#[cfg(any(test, feature = "proptest-strategies"))] +pub mod strategy; diff --git a/tiledb/api/src/query/read/aggregate/mod.rs b/tiledb/api/src/query/read/aggregate/mod.rs index ed0c3698..312b3749 100644 --- a/tiledb/api/src/query/read/aggregate/mod.rs +++ b/tiledb/api/src/query/read/aggregate/mod.rs @@ -5,7 +5,6 @@ use ffi::{ tiledb_channel_operation_t, tiledb_channel_operator_t, tiledb_query_channel_t, }; -use std::any::type_name; use std::ffi::CString; use std::fmt::{Debug, Display, Formatter, Result as FmtResult}; use std::marker::PhantomData; @@ -13,7 +12,7 @@ use std::mem; use crate::array::{CellValNum, Schema}; use crate::datatype::PhysicalType; -use crate::error::Error as TileDBError; +use crate::error::{DatatypeError, Error as TileDBError}; use crate::{Datatype, Result as TileDBResult}; /// Describes an aggregate function to apply to an array @@ -320,10 +319,7 @@ pub trait AggregateQueryBuilder: QueryBuilder { agg_function.result_type(&self.base().array().schema()?)?; if !expected_type.is_compatible_type::() { return Err(TileDBError::Datatype( - crate::error::DatatypeErrorKind::TypeMismatch { - user_type: String::from(type_name::()), - tiledb_type: expected_type, - }, + DatatypeError::physical_type_incompatible::(expected_type), )); } @@ -501,10 +497,9 @@ impl AggregateQueryBuilder for AggregateBuilder where mod tests { use std::rc::Rc; - use tiledb_test_utils::TestArrayUri; + use uri::TestArrayUri; use super::*; - use crate::error::DatatypeErrorKind; use crate::tests::prelude::*; /// Initialize a quickstart array for aggregate testing. @@ -787,7 +782,7 @@ mod tests { .build()? 
}; - let test_uri = tiledb_test_utils::get_uri_generator() + let test_uri = uri::get_uri_generator() .map_err(|e| Error::Other(e.to_string()))?; let uri = test_uri @@ -834,19 +829,23 @@ mod tests { let e = try_apply!(AggregateFunction::Count, i64); assert!(matches!( e, - Err(TileDBError::Datatype(DatatypeErrorKind::TypeMismatch { - tiledb_type: Datatype::UInt64, - .. - })) + Err(TileDBError::Datatype( + DatatypeError::PhysicalTypeIncompatible { + logical_type: Datatype::UInt64, + .. + } + )) )); let e = try_apply!(AggregateFunction::Count, u32); assert!(matches!( e, - Err(TileDBError::Datatype(DatatypeErrorKind::TypeMismatch { - tiledb_type: Datatype::UInt64, - .. - })) + Err(TileDBError::Datatype( + DatatypeError::PhysicalTypeIncompatible { + logical_type: Datatype::UInt64, + .. + } + )) )); } @@ -856,20 +855,24 @@ mod tests { try_apply!(AggregateFunction::NullCount("a".to_owned()), i64); assert!(matches!( e, - Err(TileDBError::Datatype(DatatypeErrorKind::TypeMismatch { - tiledb_type: Datatype::UInt64, - .. - })) + Err(TileDBError::Datatype( + DatatypeError::PhysicalTypeIncompatible { + logical_type: Datatype::UInt64, + .. + } + )) )); let e = try_apply!(AggregateFunction::NullCount("a".to_owned()), u32); assert!(matches!( e, - Err(TileDBError::Datatype(DatatypeErrorKind::TypeMismatch { - tiledb_type: Datatype::UInt64, - .. - })) + Err(TileDBError::Datatype( + DatatypeError::PhysicalTypeIncompatible { + logical_type: Datatype::UInt64, + .. + } + )) )); } @@ -878,19 +881,23 @@ mod tests { let e = try_apply!(AggregateFunction::Min("a".to_owned()), i64); assert!(matches!( e, - Err(TileDBError::Datatype(DatatypeErrorKind::TypeMismatch { - tiledb_type: Datatype::Int32, - .. - })) + Err(TileDBError::Datatype( + DatatypeError::PhysicalTypeIncompatible { + logical_type: Datatype::Int32, + .. 
+ } + )) )); let e = try_apply!(AggregateFunction::Max("a".to_owned()), u32); assert!(matches!( e, - Err(TileDBError::Datatype(DatatypeErrorKind::TypeMismatch { - tiledb_type: Datatype::Int32, - .. - })) + Err(TileDBError::Datatype( + DatatypeError::PhysicalTypeIncompatible { + logical_type: Datatype::Int32, + .. + } + )) )); } @@ -899,19 +906,23 @@ mod tests { let e = try_apply!(AggregateFunction::Sum("a".to_owned()), u64); assert!(matches!( e, - Err(TileDBError::Datatype(DatatypeErrorKind::TypeMismatch { - tiledb_type: Datatype::Int64, - .. - })) + Err(TileDBError::Datatype( + DatatypeError::PhysicalTypeIncompatible { + logical_type: Datatype::Int64, + .. + } + )) )); let e = try_apply!(AggregateFunction::Sum("a".to_owned()), i32); assert!(matches!( e, - Err(TileDBError::Datatype(DatatypeErrorKind::TypeMismatch { - tiledb_type: Datatype::Int64, - .. - })) + Err(TileDBError::Datatype( + DatatypeError::PhysicalTypeIncompatible { + logical_type: Datatype::Int64, + .. + } + )) )); } @@ -920,19 +931,23 @@ mod tests { let e = try_apply!(AggregateFunction::Mean("a".to_owned()), i32); assert!(matches!( e, - Err(TileDBError::Datatype(DatatypeErrorKind::TypeMismatch { - tiledb_type: Datatype::Float64, - .. - })) + Err(TileDBError::Datatype( + DatatypeError::PhysicalTypeIncompatible { + logical_type: Datatype::Float64, + .. + } + )) )); let e = try_apply!(AggregateFunction::Mean("a".to_owned()), f32); assert!(matches!( e, - Err(TileDBError::Datatype(DatatypeErrorKind::TypeMismatch { - tiledb_type: Datatype::Float64, - .. - })) + Err(TileDBError::Datatype( + DatatypeError::PhysicalTypeIncompatible { + logical_type: Datatype::Float64, + .. + } + )) )); } @@ -965,7 +980,7 @@ mod tests { .build()? 
}; - let test_uri = tiledb_test_utils::get_uri_generator() + let test_uri = uri::get_uri_generator() .map_err(|e| Error::Other(e.to_string()))?; let uri = test_uri diff --git a/tiledb/api/src/query/read/callback.rs b/tiledb/api/src/query/read/callback.rs index 128644a3..5925aca0 100644 --- a/tiledb/api/src/query/read/callback.rs +++ b/tiledb/api/src/query/read/callback.rs @@ -4,7 +4,6 @@ use anyhow::anyhow; use itertools::izip; use paste::paste; -use crate::datatype::PhysicalType; use crate::query::buffer::RefTypedQueryBuffersMut; use crate::query::read::output::{ FromQueryOutput, RawReadOutput, TypedRawReadOutput, @@ -14,7 +13,7 @@ macro_rules! trait_read_callback { ($name:ident, $($U:ident),+) => { pub trait $name: Sized { $( - type $U: PhysicalType; + type $U: CellValue; )+ type Intermediate; type Final; @@ -96,7 +95,7 @@ impl FnMutAdapter { impl ReadCallback for FnMutAdapter where A: FromQueryOutput, - ::Unit: PhysicalType, + ::Unit: CellValue, F: Clone + FnMut(A), { type Unit = ::Unit; @@ -143,7 +142,7 @@ macro_rules! 
fn_mut_adapter_tuple { impl<$($A),+, F> $callback for FnMutAdapter<($($A),+), F> where $( $A: FromQueryOutput, - <$A as FromQueryOutput>::Unit: PhysicalType + <$A as FromQueryOutput>::Unit: CellValue ),+, F: Clone + FnMut($($A),+) { @@ -428,7 +427,7 @@ mod impls { impl ReadCallback for Vec where - C: PhysicalType, + C: CellValue, { type Unit = C; type Intermediate = (); @@ -453,7 +452,7 @@ mod impls { impl ReadCallback for (Vec, Vec) where - C: PhysicalType, + C: CellValue, { type Unit = C; type Intermediate = (); @@ -485,7 +484,7 @@ mod impls { impl ReadCallback for Vec> where - C: PhysicalType, + C: CellValue, { type Unit = C; type Intermediate = (); @@ -1186,7 +1185,7 @@ mod tests { fn do_read_result_repr(dst_unit_capacity: usize, unitsrc: Vec) where - C: PhysicalType, + C: CellValue, { let alloc = NonVarSized { capacity: dst_unit_capacity, @@ -1370,7 +1369,7 @@ mod tests { fn read_result_strings( record_capacity in MIN_RECORDS..=MAX_RECORDS, byte_capacity in MIN_BYTE_CAPACITY..=MAX_BYTE_CAPACITY, - stringsrc in crate::query::buffer::strategy::prop_string_vec( + stringsrc in crate::query::buffer::tests::prop_string_vec( (MIN_RECORDS..=MAX_RECORDS).into() ) ) diff --git a/tiledb/api/src/query/read/mod.rs b/tiledb/api/src/query/read/mod.rs index 7cc84e21..9b7e30f1 100644 --- a/tiledb/api/src/query/read/mod.rs +++ b/tiledb/api/src/query/read/mod.rs @@ -106,13 +106,13 @@ impl ReadStepOutput { /// # Examples /// /// ``` - /// use tiledb::query::ReadStepOutput; + /// use tiledb_api::query::ReadStepOutput; /// let r = ReadStepOutput::::Intermediate("tiledb".to_string()); /// assert_eq!("tiledb", r.unwrap_intermediate()); /// ``` /// /// ```should_panic - /// use tiledb::query::ReadStepOutput; + /// use tiledb_api::query::ReadStepOutput; /// let r = ReadStepOutput::::Final("tiledb".to_string()); /// assert_eq!("tiledb", r.unwrap_intermediate()); // fails /// ``` @@ -133,13 +133,13 @@ impl ReadStepOutput { /// # Examples /// /// ``` - /// use 
tiledb::query::ReadStepOutput; + /// use tiledb_api::query::ReadStepOutput; /// let r = ReadStepOutput::::Final("tiledb".to_string()); /// assert_eq!("tiledb", r.unwrap_final()); /// ``` /// /// ```should_panic - /// use tiledb::query::ReadStepOutput; + /// use tiledb_api::query::ReadStepOutput; /// let r = ReadStepOutput::::Intermediate("tiledb".to_string()); /// assert_eq!("tiledb", r.unwrap_final()); // fails /// ``` diff --git a/tiledb/api/src/query/read/output/arrow.rs b/tiledb/api/src/query/read/output/arrow.rs index d4b00fb0..3d13fd77 100644 --- a/tiledb/api/src/query/read/output/arrow.rs +++ b/tiledb/api/src/query/read/output/arrow.rs @@ -6,12 +6,13 @@ use arrow::array::{ GenericListArray, LargeBinaryArray, PrimitiveArray, }; use arrow::datatypes::Field; +use tiledb_common::array::CellValNum; +use tiledb_common::datatype::Datatype; -use crate::array::CellValNum; use crate::datatype::arrow::ArrowPrimitiveTypeNative; use crate::query::buffer::arrow::{Celled, QueryBufferArrowArray}; use crate::query::read::output::{RawReadOutput, TypedRawReadOutput}; -use crate::{typed_query_buffers_go, Datatype}; +use crate::typed_query_buffers_go; impl TryFrom> for QueryBufferArrowArray where @@ -53,12 +54,12 @@ impl TryFrom> for Arc { * we have an array of something like `i64` and need to * turn that into `ADT::Time64(TimeUnit::Microsecond)` for example. 
*/ - let arrow_datatype = crate::datatype::arrow::to_arrow( + let arrow_datatype = tiledb_common::datatype::arrow::to_arrow( &datatype, CellValNum::single(), ); let list_field_metadata = if arrow_datatype.is_inexact() { - HashMap::from([(crate::datatype::arrow::ARROW_FIELD_METADATA_KEY_TILEDB_TYPE_HINT.to_string(), datatype.to_string())]) + HashMap::from([(tiledb_common::datatype::arrow::ARROW_FIELD_METADATA_KEY_TILEDB_TYPE_HINT.to_string(), datatype.to_string())]) } else { HashMap::new() }; @@ -385,7 +386,7 @@ mod tests { } fn do_raw_read_to_record_batch(rr: TypedRawReadOutput) { - let arrow = crate::datatype::arrow::to_arrow( + let arrow = tiledb_common::datatype::arrow::to_arrow( &rr.datatype, rr.cell_structure().as_cell_val_num(), ) diff --git a/tiledb/api/src/query/read/output/mod.rs b/tiledb/api/src/query/read/output/mod.rs index c90563fa..83191bdf 100644 --- a/tiledb/api/src/query/read/output/mod.rs +++ b/tiledb/api/src/query/read/output/mod.rs @@ -3,20 +3,22 @@ use std::iter::FusedIterator; use std::num::{NonZeroU32, NonZeroUsize}; use anyhow::anyhow; -use serde_json::json; use crate::array::CellValNum; use crate::datatype::PhysicalType; -use crate::error::{DatatypeErrorKind, Error}; +use crate::error::Error; use crate::query::buffer::*; +use crate::query::CellValue; use crate::Result as TileDBResult; use crate::{typed_query_buffers_go, Datatype}; #[cfg(feature = "arrow")] pub mod arrow; + #[cfg(any(test, feature = "proptest-strategies"))] pub mod strategy; +#[derive(Debug)] pub struct RawReadOutput<'data, C> { pub ncells: usize, pub input: QueryBuffers<'data, C>, @@ -41,10 +43,13 @@ impl RawReadOutput<'_, C> { self.nvalues() * std::mem::size_of::() } - fn to_json(&self) -> serde_json::value::Value + #[cfg(feature = "serde")] + pub fn to_json(&self) -> serde_json::value::Value where C: Debug, { + use serde_json::json; + let cell_json = match self.input.cell_structure { CellStructure::Fixed(nz) => json!({"cell_val_num": nz}), CellStructure::Var(ref offsets) 
=> json!({ @@ -76,15 +81,7 @@ impl RawReadOutput<'_, C> { } } -impl Debug for RawReadOutput<'_, C> -where - C: Debug, -{ - fn fmt(&self, f: &mut Formatter) -> FmtResult { - write!(f, "{}", self.to_json()) - } -} - +#[derive(Debug)] pub struct TypedRawReadOutput<'data> { pub datatype: Datatype, pub ncells: usize, @@ -127,20 +124,6 @@ impl<'data> TypedRawReadOutput<'data> { } } -impl Debug for TypedRawReadOutput<'_> { - fn fmt(&self, f: &mut Formatter) -> FmtResult { - let mut json = typed_query_buffers_go!(self.buffers, _DT, ref qb, { - RawReadOutput { - ncells: self.ncells, - input: qb.borrow(), - } - .to_json() - }); - json["datatype"] = json!(self.datatype); - write!(f, "{}", json) - } -} - /// Represents either a fixed number of values per cell, /// or the scratch space needed to write the offsets needed to determine /// the variable number of values per cell. @@ -622,13 +605,10 @@ impl<'data, C> CellStructureSingleIterator<'data, C> { ) -> TileDBResult { match QueryBuffersCellStructureSingle::try_from(input) { Ok(qb) => Ok(Self::new(ncells, qb)), - Err(qb) => Err(Error::Datatype( - DatatypeErrorKind::UnexpectedCellStructure { - context: None, - expected: CellValNum::single(), - found: qb.cell_structure.as_cell_val_num(), - }, - )), + Err(qb) => Err(Error::UnexpectedCellStructure { + expected: CellValNum::single(), + found: qb.cell_structure.as_cell_val_num(), + }), } } } @@ -727,13 +707,10 @@ impl<'data, C> FixedDataIterator<'data, C> { "FixedDataIterator cannot take ownership of data inside QueryBuffers"))) } else { assert!(!QueryBuffersCellStructureFixed::accept(&input)); - Err(Error::Datatype( - DatatypeErrorKind::UnexpectedCellStructure { - context: None, - expected: CellValNum::single(), /* TODO: this is not really accurate, any Fixed */ - found: input.cell_structure.as_cell_val_num(), - }, - )) + Err(Error::UnexpectedCellStructure { + expected: CellValNum::single(), /* TODO: this is not really accurate, any Fixed */ + found: 
input.cell_structure.as_cell_val_num(), + }) } } } @@ -813,13 +790,10 @@ impl<'data, C> VarDataIterator<'data, C> { "VarDataIterator cannot take ownership of data inside QueryBuffers"))) } else { assert!(!QueryBuffersCellStructureVar::accept(&input)); - Err(Error::Datatype( - DatatypeErrorKind::UnexpectedCellStructure { - context: None, - expected: CellValNum::Var, - found: input.cell_structure.as_cell_val_num(), - }, - )) + Err(Error::UnexpectedCellStructure { + expected: CellValNum::Var, + found: input.cell_structure.as_cell_val_num(), + }) } } } @@ -930,7 +904,7 @@ pub trait FromQueryOutput: Sized { impl FromQueryOutput for C where - C: PhysicalType, + C: CellValue, { type Unit = C; type Iterator<'data> diff --git a/tiledb/api/src/query/read/typed.rs b/tiledb/api/src/query/read/typed.rs index f4c19f75..8fe8f132 100644 --- a/tiledb/api/src/query/read/typed.rs +++ b/tiledb/api/src/query/read/typed.rs @@ -1,6 +1,5 @@ use super::*; - -use crate::datatype::PhysicalType; +use crate::query::CellValue; pub trait ReadResult: Sized { type Constructor: ReadCallback; @@ -113,14 +112,14 @@ mod impls { impl ReadResult for Vec where - C: PhysicalType, + C: CellValue, { type Constructor = Self; } impl ReadResult for (Vec, Vec) where - C: PhysicalType, + C: CellValue, { type Constructor = Self; } diff --git a/tiledb/api/src/query/strategy.rs b/tiledb/api/src/query/strategy.rs index bfacf424..edc27788 100644 --- a/tiledb/api/src/query/strategy.rs +++ b/tiledb/api/src/query/strategy.rs @@ -1,116 +1,82 @@ -use std::cmp::Ordering; -use std::collections::hash_map::Entry; use std::collections::HashMap; -use std::fmt::Debug; -use std::ops::{Range, RangeInclusive}; use std::rc::Rc; -use paste::paste; -use proptest::bits::{BitSetLike, VarBitSet}; -use proptest::collection::SizeRange; -use proptest::prelude::*; -use proptest::strategy::{NewTree, ValueTree}; -use proptest::test_runner::TestRunner; -use tiledb_test_utils::strategy::records::{Records, RecordsValueTree}; - -use 
crate::array::schema::FieldData as SchemaField; -use crate::array::{ArrayType, CellValNum, SchemaData}; -use crate::datatype::physical::{BitsEq, BitsOrd, IntegralType}; -use crate::error::Error; +use cells::write::{ + DenseWriteInput, SparseWriteInput, WriteInput, WriteInputRef, +}; +use cells::{typed_field_data_go, Cells, FieldData}; +use tiledb_common::array::{ArrayType, CellValNum}; +use tiledb_common::physical_type_go; +use tiledb_pod::array::dimension::strategy::Requirements as DimensionRequirements; +use tiledb_pod::array::domain::strategy::Requirements as DomainRequirements; +use tiledb_pod::array::schema::strategy::Requirements as SchemaRequirements; +use tiledb_pod::filter::strategy::Requirements as FilterRequirements; + +use super::*; use crate::query::read::output::{ CellStructureSingleIterator, FixedDataIterator, RawReadOutput, TypedRawReadOutput, VarDataIterator, }; use crate::query::read::{ - CallbackVarArgReadBuilder, FieldMetadata, ManagedBuffer, Map, - RawReadHandle, ReadCallbackVarArg, ReadQueryBuilder, TypedReadHandle, -}; -use crate::query::WriteBuilder; -use crate::{ - dimension_constraints_go, physical_type_go, typed_query_buffers_go, - Datatype, Result as TileDBResult, + CallbackVarArgReadBuilder, FieldMetadata, ManagedBuffer, Map, MapAdapter, + RawReadHandle, ReadCallbackVarArg, TypedReadHandle, }; +use crate::typed_query_buffers_go; -/// Represents the write query input for a single field. +/// Returns a base set of requirements for filters to be used +/// in write queries. /// -/// For each variant, the outer Vec is the collection of records, and the interior is value in the -/// cell for the record. Fields with cell val num of 1 are flat, and other cell values use the -/// inner Vec. For fixed-size attributes, the inner Vecs shall all have the same length; for -/// var-sized attributes that is obviously not required. 
-#[derive(Clone, Debug, PartialEq)] -pub enum FieldData { - UInt8(Vec), - UInt16(Vec), - UInt32(Vec), - UInt64(Vec), - Int8(Vec), - Int16(Vec), - Int32(Vec), - Int64(Vec), - Float32(Vec), - Float64(Vec), - VecUInt8(Vec>), - VecUInt16(Vec>), - VecUInt32(Vec>), - VecUInt64(Vec>), - VecInt8(Vec>), - VecInt16(Vec>), - VecInt32(Vec>), - VecInt64(Vec>), - VecFloat32(Vec>), - VecFloat64(Vec>), -} - -macro_rules! typed_field_data { - ($($V:ident : $U:ty),+) => { - $( - impl From> for FieldData { - fn from(value: Vec<$U>) -> Self { - FieldData::$V(value) - } - } - - impl From>> for FieldData { - fn from(value: Vec>) -> Self { - paste! { - FieldData::[< Vec $V >](value) - } - } - } - - impl TryFrom for Vec<$U> { - type Error = Error; - - fn try_from(value: FieldData) -> Result { - if let FieldData::$V(values) = value { - Ok(values) - } else { - crate::typed_field_data_go!(value, DT, _, - { - Err(Error::physical_type_mismatch::<$U, DT>()) - }, - { - Err(Error::physical_type_mismatch::<$U, Vec
>()) - }) - } - } - } - )+ - }; -} - -typed_field_data!(UInt8: u8, UInt16: u16, UInt32: u32, UInt64: u64); -typed_field_data!(Int8: i8, Int16: i16, Int32: i32, Int64: i64); -typed_field_data!(Float32: f32, Float64: f64); - -impl From> for FieldData { - fn from(value: Vec) -> Self { - FieldData::from( - value - .into_iter() - .map(|s| s.into_bytes()) - .collect::>>(), - ) +/// Requirements are chosen to either avoid +/// constraints on input (e.g. positive delta filtering requires +/// sorted input, float scale filtering is not invertible) +/// or to avoid issues in the tiledb core library in as +/// many scenarios as possible. +// now that we're actually writing data we will hit the fun bugs. +// there are several in the filter pipeline, so we must heavily +// restrict what is allowed until the bugs are fixed. +pub fn query_write_filter_requirements() -> FilterRequirements { + FilterRequirements { + allow_bit_reduction: false, // SC-47560 + allow_bit_shuffle: false, // SC-48409 + allow_byte_shuffle: false, // SC-48409 + allow_positive_delta: false, // nothing yet to ensure sort order + allow_scale_float: false, // not invertible due to precision loss + allow_xor: false, // SC-47328 + allow_compression_rle: false, // probably can be enabled but nontrivial + allow_compression_dict: false, // probably can be enabled but nontrivial + allow_compression_delta: false, // SC-47328 + allow_webp: false, // SC-51250 + ..Default::default() + } +} + +/// Returns a base set of schema requirements for running a query. +/// +/// Requirements are chosen to either avoid constraints on write input +/// or to avoid issues in the tiledb core library in as many scenarios as possible. +pub fn query_write_schema_requirements( + array_type: Option, +) -> SchemaRequirements { + // NB: 1 is the highest number that passes all cases (so don't use the value given by + // `DomainRequirements::default()`) but we want to enable environmental override. 
+ let env_max_dimensions = + DomainRequirements::env_max_dimensions().unwrap_or(1); + + SchemaRequirements { + domain: Some(Rc::new(DomainRequirements { + array_type, + num_dimensions: 1..=env_max_dimensions, + dimension: Some(DimensionRequirements { + filters: Some(Rc::new(query_write_filter_requirements())), + ..Default::default() + }), + ..Default::default() + })), + attribute_filters: Some(Rc::new(query_write_filter_requirements())), + coordinates_filters: Some(Rc::new(query_write_filter_requirements())), + offsets_filters: Some(Rc::new(query_write_filter_requirements())), + validity_filters: Some(Rc::new(query_write_filter_requirements())), + ..Default::default() } } @@ -144,553 +110,189 @@ impl From<&TypedRawReadOutput<'_>> for FieldData { } } -impl Records for FieldData { - fn len(&self) -> usize { - self.len() - } - - fn filter(&self, subset: &VarBitSet) -> Self { - self.filter(subset) - } -} - -/// Applies a generic expression to the interior of a `FieldData` value. -/// -/// The first form of this macro applies the same expression to all variants. -/// The second form enables applying a different expression to the forms -/// with an interior `Vec
` versus `Vec>`. -/// The third form enables applying a different expression to the forms -/// with an interior `Vec
` versus `Vec` versus `Vec>` versus `Vec>`, -/// where `DT` is an integral type and `FT` is a floating-point type. -/// -/// # Examples -/// ``` -/// use tiledb::query::strategy::FieldData; -/// use tiledb::typed_field_data_go; -/// -/// fn dedup_cells(cells: &mut FieldData) { -/// typed_field_data_go!(cells, ref mut cells_interior, cells_interior.dedup()) -/// } -/// let mut cells = FieldData::UInt64(vec![1, 2, 2, 3, 2]); -/// dedup_cells(&mut cells); -/// assert_eq!(cells, FieldData::UInt64(vec![1, 2, 3, 2])); -/// ``` -#[macro_export] -macro_rules! typed_field_data_go { - ($field:expr, $data:pat, $then:expr) => { - $crate::typed_field_data_go!($field, _DT, $data, $then, $then) - }; - ($field:expr, $DT:ident, $data:pat, $fixed:expr, $var:expr) => { - $crate::typed_field_data_go!( - $field, $DT, $data, $fixed, $var, $fixed, $var - ) - }; - ($field:expr, $DT:ident, $data:pat, $integral_fixed:expr, $integral_var:expr, $float_fixed:expr, $float_var:expr) => {{ - use $crate::query::strategy::FieldData; - match $field { - FieldData::UInt8($data) => { - type $DT = u8; - $integral_fixed - } - FieldData::UInt16($data) => { - type $DT = u16; - $integral_fixed - } - FieldData::UInt32($data) => { - type $DT = u32; - $integral_fixed - } - FieldData::UInt64($data) => { - type $DT = u64; - $integral_fixed - } - FieldData::Int8($data) => { - type $DT = i8; - $integral_fixed - } - FieldData::Int16($data) => { - type $DT = i16; - $integral_fixed - } - FieldData::Int32($data) => { - type $DT = i32; - $integral_fixed - } - FieldData::Int64($data) => { - type $DT = i64; - $integral_fixed - } - FieldData::Float32($data) => { - type $DT = f32; - $float_fixed - } - FieldData::Float64($data) => { - type $DT = f64; - $float_fixed - } - FieldData::VecUInt8($data) => { - type $DT = u8; - $integral_var - } - FieldData::VecUInt16($data) => { - type $DT = u16; - $integral_var - } - FieldData::VecUInt32($data) => { - type $DT = u32; - $integral_var - } - FieldData::VecUInt64($data) => { - type 
$DT = u64; - $integral_var - } - FieldData::VecInt8($data) => { - type $DT = i8; - $integral_var - } - FieldData::VecInt16($data) => { - type $DT = i16; - $integral_var - } - FieldData::VecInt32($data) => { - type $DT = i32; - $integral_var - } - FieldData::VecInt64($data) => { - type $DT = i64; - $integral_var - } - FieldData::VecFloat32($data) => { - type $DT = f32; - $float_var - } - FieldData::VecFloat64($data) => { - type $DT = f64; - $float_var - } - } - }}; -} - -/// Applies a generic expression to the interiors of two `FieldData` values with matching variants, -/// i.e. with the same physical data type. Typical usage is for comparing the insides of the two -/// `FieldData` values. -macro_rules! typed_field_data_cmp { - ($lexpr:expr, $rexpr:expr, $DT:ident, $lpat:pat, $rpat:pat, $same_type:expr, $else:expr) => {{ - use $crate::query::strategy::FieldData; - match ($lexpr, $rexpr) { - (FieldData::UInt8($lpat), FieldData::UInt8($rpat)) => { - type $DT = u8; - $same_type - } - (FieldData::UInt16($lpat), FieldData::UInt16($rpat)) => { - type $DT = u16; - $same_type - } - (FieldData::UInt32($lpat), FieldData::UInt32($rpat)) => { - type $DT = u32; - $same_type - } - (FieldData::UInt64($lpat), FieldData::UInt64($rpat)) => { - type $DT = u64; - $same_type - } - (FieldData::Int8($lpat), FieldData::Int8($rpat)) => { - type $DT = i8; - $same_type - } - (FieldData::Int16($lpat), FieldData::Int16($rpat)) => { - type $DT = i16; - $same_type - } - (FieldData::Int32($lpat), FieldData::Int32($rpat)) => { - type $DT = i32; - $same_type - } - (FieldData::Int64($lpat), FieldData::Int64($rpat)) => { - type $DT = i64; - $same_type - } - (FieldData::Float32($lpat), FieldData::Float32($rpat)) => { - type $DT = f32; - $same_type - } - (FieldData::Float64($lpat), FieldData::Float64($rpat)) => { - type $DT = f64; - $same_type - } - (FieldData::VecUInt8($lpat), FieldData::VecUInt8($rpat)) => { - type $DT = u8; - $same_type - } - (FieldData::VecUInt16($lpat), FieldData::VecUInt16($rpat)) 
=> { - type $DT = u16; - $same_type - } - (FieldData::VecUInt32($lpat), FieldData::VecUInt32($rpat)) => { - type $DT = u32; - $same_type - } - (FieldData::VecUInt64($lpat), FieldData::VecUInt64($rpat)) => { - type $DT = u64; - $same_type - } - (FieldData::VecInt8($lpat), FieldData::VecInt8($rpat)) => { - type $DT = i8; - $same_type - } - (FieldData::VecInt16($lpat), FieldData::VecInt16($rpat)) => { - type $DT = i16; - $same_type - } - (FieldData::VecInt32($lpat), FieldData::VecInt32($rpat)) => { - type $DT = i32; - $same_type - } - (FieldData::VecInt64($lpat), FieldData::VecInt64($rpat)) => { - type $DT = i64; - $same_type - } - (FieldData::VecFloat32($lpat), FieldData::VecFloat32($rpat)) => { - type $DT = f32; - $same_type - } - (FieldData::VecFloat64($lpat), FieldData::VecFloat64($rpat)) => { - type $DT = f64; - $same_type - } - _ => $else, - } - }}; -} - -impl FieldData { - pub fn is_empty(&self) -> bool { - typed_field_data_go!(self, v, v.is_empty()) - } - - pub fn len(&self) -> usize { - typed_field_data_go!(self, v, v.len()) - } +impl ToReadQuery for Cells { + type ReadBuilder<'data, B> = + CallbackVarArgReadBuilder<'data, RawResultCallback, B>; - /// Returns the number of null values. - /// - /// At this time, values in `FieldData` are not nullable, so this is always zero. - pub fn null_count(&self) -> usize { - 0 - } + fn attach_read<'data, B>( + &self, + b: B, + ) -> TileDBResult> + where + B: ReadQueryBuilder<'data>, + { + let field_order = self.fields().keys().cloned().collect::>(); + let handles = { + let schema = b.base().array().schema().unwrap(); - pub fn is_cell_single(&self) -> bool { - typed_field_data_go!(self, _DT, _, true, false) - } + field_order + .iter() + .map(|name| { + let field = schema.field(name.clone()).unwrap(); + physical_type_go!(field.datatype().unwrap(), DT, { + let managed: ManagedBuffer
= ManagedBuffer::new( + field.query_scratch_allocator(None).unwrap(), + ); + let metadata = FieldMetadata::try_from(&field).unwrap(); + let rr = RawReadHandle::managed(metadata, managed); + TypedReadHandle::from(rr) + }) + }) + .collect::>() + }; - pub fn slice(&self, start: usize, len: usize) -> FieldData { - typed_field_data_go!(self, ref values, { - FieldData::from(values[start..start + len].to_vec().clone()) - }) + b.register_callback_var(handles, RawResultCallback { field_order }) } +} - pub fn filter(&self, set: &VarBitSet) -> FieldData { - typed_field_data_go!(self, ref values, { - FieldData::from( - values - .clone() - .into_iter() - .enumerate() - .filter(|&(i, _)| set.test(i)) - .map(|(_, e)| e) - .collect::>(), - ) - }) - } +impl ToReadQuery for DenseWriteInput { + type ReadBuilder<'data, B> = CallbackVarArgReadBuilder< + 'data, + MapAdapter, + B, + > where Self: 'data; - pub fn truncate(&mut self, len: usize) { - typed_field_data_go!(self, ref mut data, data.truncate(len)) - } + fn attach_read<'data, B>( + &'data self, + b: B, + ) -> TileDBResult> + where + B: ReadQueryBuilder<'data>, + { + let mut subarray = b.start_subarray()?; - pub fn sort(&mut self) { - typed_field_data_go!( - self, - DT, - ref mut data, - { - let cmp = |k1: &DT, k2: &DT| k1.bits_cmp(k2); - data.sort_by(cmp) - }, - { - let cmp = |k1: &Vec
, k2: &Vec
| k1.bits_cmp(k2); - data.sort_by(cmp) - } - ); - } + for i in 0..self.subarray.len() { + subarray = subarray.add_range(i, self.subarray[i].clone())?; + } - pub fn extend(&mut self, other: Self) { - typed_field_data_cmp!( - self, - other, - _DT, - ref mut data, - other_data, - { - // the field types match - data.extend(other_data); - }, - { - // if they do not match - panic!("Field types do not match in `FieldData::extend`") - } - ) - } -} + let b: B = subarray.finish_subarray()?.layout(self.layout)?; -impl BitsEq for FieldData { - fn bits_eq(&self, other: &Self) -> bool { - typed_field_data_cmp!( - self, - other, - _DT, - ref data, - ref other_data, - data.bits_eq(other_data), // match - false // fields do not match - ) + Ok(self.data.attach_read(b)?.map(CellsConstructor::new())) } } -#[derive(Clone, Debug)] -pub enum FieldStrategyDatatype { - Datatype(Datatype, CellValNum), - SchemaField(SchemaField), -} +impl ToReadQuery for SparseWriteInput { + type ReadBuilder<'data, B> = CallbackVarArgReadBuilder< + 'data, + MapAdapter, + B, + >; -pub enum FieldValueStrategy { - UInt8(BoxedStrategy), - UInt16(BoxedStrategy), - UInt32(BoxedStrategy), - UInt64(BoxedStrategy), - Int8(BoxedStrategy), - Int16(BoxedStrategy), - Int32(BoxedStrategy), - Int64(BoxedStrategy), - Float32(BoxedStrategy), - Float64(BoxedStrategy), + fn attach_read<'data, B>( + &'data self, + b: B, + ) -> TileDBResult> + where + B: ReadQueryBuilder<'data>, + { + Ok(self.data.attach_read(b)?.map(CellsConstructor::new())) + } } -macro_rules! 
field_value_strategy { - ($($variant:ident : $T:ty),+) => { - $( - impl From> for FieldValueStrategy { - fn from(value: BoxedStrategy<$T>) -> Self { - Self::$variant(value) - } - } +impl ToReadQuery for WriteInput { + type ReadBuilder<'data, B> = CallbackVarArgReadBuilder< + 'data, + MapAdapter, + B, + >; - impl TryFrom for BoxedStrategy<$T> { - type Error = (); - fn try_from(value: FieldValueStrategy) -> Result { - if let FieldValueStrategy::$variant(b) = value { - Ok(b) - } else { - Err(()) - } - } - } - )+ + fn attach_read<'data, B>( + &'data self, + b: B, + ) -> TileDBResult> + where + B: ReadQueryBuilder<'data>, + { + match self { + Self::Dense(ref d) => d.attach_read(b), + Self::Sparse(ref s) => s.attach_read(b), + } } } -field_value_strategy!(UInt8 : u8, UInt16 : u16, UInt32 : u32, UInt64 : u64); -field_value_strategy!(Int8 : i8, Int16 : i16, Int32 : i32, Int64 : i64); -field_value_strategy!(Float32 : f32, Float64 : f64); +impl ToReadQuery for WriteInputRef<'_> { + type ReadBuilder<'data, B> = CallbackVarArgReadBuilder< + 'data, + MapAdapter, + B, + > where Self: 'data; -#[derive(Clone, Debug)] -pub struct FieldDataParameters { - pub nrecords: SizeRange, - pub datatype: Option, - pub value_min_var_size: usize, - pub value_max_var_size: usize, -} - -impl Default for FieldDataParameters { - fn default() -> Self { - FieldDataParameters { - nrecords: (0..=1024).into(), - datatype: None, - value_min_var_size: 1, /* SC-48409 and SC-48428 workaround */ - value_max_var_size: 8, /* TODO */ + fn attach_read<'data, B>( + &'data self, + b: B, + ) -> TileDBResult> + where + B: ReadQueryBuilder<'data>, + { + match self { + Self::Dense(d) => d.attach_read(b), + Self::Sparse(s) => s.attach_read(b), } } } -trait ArbitraryFieldData: Sized { - fn arbitrary( - params: FieldDataParameters, - cell_val_num: CellValNum, - value_strat: BoxedStrategy, - ) -> BoxedStrategy; -} - -impl
ArbitraryFieldData for DT -where - DT: IntegralType, - FieldData: From> + From>>, -{ - fn arbitrary( - params: FieldDataParameters, - cell_val_num: CellValNum, - value_strat: BoxedStrategy, - ) -> BoxedStrategy { - if cell_val_num == 1u32 { - proptest::collection::vec(value_strat, params.nrecords) - .prop_map(FieldData::from) - .boxed() - } else { - let (min, max) = if cell_val_num.is_var_sized() { - (params.value_min_var_size, params.value_max_var_size) - } else { - let fixed_bound = Into::::into(cell_val_num) as usize; - (fixed_bound, fixed_bound) - }; - - let cell_strat = proptest::collection::vec(value_strat, min..=max); - - proptest::collection::vec(cell_strat, params.nrecords) - .prop_map(FieldData::from) - .boxed() +impl ToWriteQuery for Cells { + fn attach_write<'data>( + &'data self, + b: WriteBuilder<'data>, + ) -> TileDBResult> { + let mut b = b; + for f in self.fields().iter() { + b = typed_field_data_go!(f.1, data, b.data_typed(f.0, data))?; } + Ok(b) } } -impl ArbitraryFieldData for f32 { - fn arbitrary( - params: FieldDataParameters, - cell_val_num: CellValNum, - value_strat: BoxedStrategy, - ) -> BoxedStrategy { - let value_strat = value_strat.prop_map(|float| float.to_bits()).boxed(); +impl ToWriteQuery for DenseWriteInput { + fn attach_write<'data>( + &'data self, + b: WriteBuilder<'data>, + ) -> TileDBResult> { + let mut subarray = self.data.attach_write(b)?.start_subarray()?; - fn transform(v: Vec) -> Vec { - v.into_iter().map(f32::from_bits).collect::>() + for i in 0..self.subarray.len() { + subarray = subarray.add_range(i, self.subarray[i].clone())?; } - ::arbitrary( - params, - cell_val_num, - value_strat, - ) - .prop_map(|field_data| match field_data { - FieldData::UInt32(values) => FieldData::Float32(transform(values)), - FieldData::VecUInt32(values) => FieldData::VecFloat32( - values.into_iter().map(transform).collect::>>(), - ), - _ => unreachable!(), - }) - .boxed() + subarray.finish_subarray()?.layout(self.layout) } } -impl 
ArbitraryFieldData for f64 { - fn arbitrary( - params: FieldDataParameters, - cell_val_num: CellValNum, - value_strat: BoxedStrategy, - ) -> BoxedStrategy { - let value_strat = value_strat.prop_map(|float| float.to_bits()).boxed(); +impl ToWriteQuery for SparseWriteInput { + fn attach_write<'data>( + &'data self, + b: WriteBuilder<'data>, + ) -> TileDBResult> { + self.data.attach_write(b) + } +} - fn transform(v: Vec) -> Vec { - v.into_iter().map(f64::from_bits).collect::>() +impl ToWriteQuery for WriteInput { + fn attach_write<'data>( + &'data self, + b: WriteBuilder<'data>, + ) -> TileDBResult> { + match self { + Self::Dense(ref d) => d.attach_write(b), + Self::Sparse(ref s) => s.attach_write(b), } - - ::arbitrary( - params, - cell_val_num, - value_strat, - ) - .prop_map(|field_data| match field_data { - FieldData::UInt64(values) => FieldData::Float64(transform(values)), - FieldData::VecUInt64(values) => FieldData::VecFloat64( - values.into_iter().map(transform).collect::>>(), - ), - _ => unreachable!(), - }) - .boxed() } } -impl Arbitrary for FieldData { - type Strategy = BoxedStrategy; - type Parameters = FieldDataParameters; - - fn arbitrary_with(params: Self::Parameters) -> Self::Strategy { - match params.datatype.clone() { - Some(FieldStrategyDatatype::SchemaField( - SchemaField::Dimension(d), - )) => { - let value_strat = d.value_strategy(); - let cell_val_num = d.cell_val_num(); - - dimension_constraints_go!( - d.constraints, - DT, - ref domain, - _, - { -
::arbitrary( - params, - cell_val_num, - value_strat.try_into().unwrap(), - ) - }, - { - ::arbitrary( - params, - cell_val_num, - value_strat.try_into().unwrap(), - ) - } - ) - } - Some(FieldStrategyDatatype::SchemaField( - SchemaField::Attribute(a), - )) => { - let value_strat = a.value_strategy(); - let cell_val_num = - a.cell_val_num.unwrap_or(CellValNum::single()); - - physical_type_go!(a.datatype, DT, { -
::arbitrary( - params, - cell_val_num, - value_strat.try_into().unwrap(), - ) - }) - } - Some(FieldStrategyDatatype::Datatype(datatype, cell_val_num)) => { - physical_type_go!(datatype, DT, { - let value_strat = any::
().boxed(); -
::arbitrary( - params, - cell_val_num, - value_strat, - ) - }) - } - None => (any::(), any::()) - .prop_flat_map(move |(datatype, cell_val_num)| { - physical_type_go!(datatype, DT, { - let value_strat = any::
().boxed(); -
::arbitrary( - params.clone(), - cell_val_num, - value_strat, - ) - }) - }) - .boxed(), +impl ToWriteQuery for WriteInputRef<'_> { + fn attach_write<'data>( + &'data self, + b: WriteBuilder<'data>, + ) -> TileDBResult> { + match self { + Self::Dense(d) => d.attach_write(b), + Self::Sparse(s) => s.attach_write(b), } } } +// TODO: where should these go pub struct RawReadQueryResult(pub HashMap); pub struct RawResultCallback { @@ -758,1459 +360,436 @@ impl Map for CellsConstructor { } } -#[derive(Clone, Debug, PartialEq)] -pub struct Cells { - fields: HashMap, -} +#[cfg(test)] +mod tests { + use cells::write::strategy::{WriteParameters, WriteSequenceParameters}; + use cells::write::{DenseWriteInput, SparseWriteInput, WriteSequence}; + use proptest::prelude::*; + use tiledb_common::range::{NonEmptyDomain, Range}; + use tiledb_pod::array::schema::SchemaData; + use uri::TestArrayUri; -impl Cells { - /// # Panics - /// - /// Panics if the fields do not all have the same number of cells. - pub fn new(fields: HashMap) -> Self { - let mut expect_len: Option = None; - for (_, d) in fields.iter() { - if let Some(expect_len) = expect_len { - assert_eq!(d.len(), expect_len); - } else { - expect_len = Some(d.len()) - } - } + use super::*; + use crate::array::{Array, ArrayOpener, Mode}; + use crate::error::Error; + use crate::query::{ + Query, QueryBuilder, ReadBuilder, ReadQuery, WriteBuilder, + }; + use crate::{Context, Factory}; - Cells { fields } + struct DenseCellsAccumulator { + // TODO: implement accepting more than one write for dense write sequence + write: Option, } - pub fn is_empty(&self) -> bool { - self.fields.values().next().unwrap().is_empty() - } + impl DenseCellsAccumulator { + pub fn new(_: &SchemaData) -> Self { + DenseCellsAccumulator { write: None } + } - pub fn len(&self) -> usize { - self.fields.values().next().unwrap().len() + pub fn cells(&self) -> &Cells { + // will not be called until first cells are written + &self.write.as_ref().unwrap().data + } + + 
pub fn accumulate(&mut self, write: DenseWriteInput) { + if self.write.is_some() { + unimplemented!() + } + self.write = Some(write) + } + + pub fn attach_read<'data, B>( + &'data self, + b: B, + ) -> TileDBResult< + CallbackVarArgReadBuilder< + 'data, + MapAdapter, + B, + >, + > + where + B: ReadQueryBuilder<'data>, + { + // TODO: this is not correct as we accumulate multiple writes + self.write.as_ref().unwrap().attach_read(b) + } } - pub fn fields(&self) -> &HashMap { - &self.fields + struct SparseCellsAccumulator { + cells: Option, + dedup_keys: Option>, } - pub fn attach_write<'data>( - &'data self, - b: WriteBuilder<'data>, - ) -> TileDBResult> { - let mut b = b; - for f in self.fields.iter() { - b = typed_field_data_go!(f.1, data, b.data_typed(f.0, data))?; + impl SparseCellsAccumulator { + pub fn new(schema: &SchemaData) -> Self { + let dedup_keys = if schema.allow_duplicates.unwrap_or(false) { + None + } else { + Some( + schema + .domain + .dimension + .iter() + .map(|d| d.name.clone()) + .collect::>(), + ) + }; + SparseCellsAccumulator { + cells: None, + dedup_keys, + } } - Ok(b) - } - pub fn attach_read<'data, B>( - &self, - b: B, - ) -> TileDBResult> - where - B: ReadQueryBuilder<'data>, - { - let field_order = self.fields.keys().cloned().collect::>(); - let handles = { - let schema = b.base().array().schema().unwrap(); + pub fn cells(&self) -> &Cells { + // will not be called until first cells arrive + self.cells.as_ref().unwrap() + } - field_order - .iter() - .map(|name| { - let field = schema.field(name.clone()).unwrap(); - physical_type_go!(field.datatype().unwrap(), DT, { - let managed: ManagedBuffer
= ManagedBuffer::new( - field.query_scratch_allocator(None).unwrap(), - ); - let metadata = FieldMetadata::try_from(&field).unwrap(); - let rr = RawReadHandle::managed(metadata, managed); - TypedReadHandle::from(rr) - }) - }) - .collect::>() - }; + /// Update state representing what we expect to see in the array. + /// For a sparse array this means adding this write's coordinates, + /// overwriting the old coordinates if they overlap. + pub fn accumulate(&mut self, mut write: SparseWriteInput) { + if let Some(cells) = self.cells.take() { + write.data.extend(cells); + if let Some(dedup_keys) = self.dedup_keys.as_ref() { + self.cells = Some(write.data.dedup(dedup_keys)); + } else { + self.cells = Some(write.data); + } + } else { + self.cells = Some(write.data); + } + } + + pub fn attach_read<'data, B>( + &'data self, + b: B, + ) -> TileDBResult< + CallbackVarArgReadBuilder< + 'data, + MapAdapter, + B, + >, + > + where + B: ReadQueryBuilder<'data>, + { + Ok(self.cells().attach_read(b)?.map(CellsConstructor::new())) + } + } - b.register_callback_var(handles, RawResultCallback { field_order }) + enum CellsAccumulator { + Dense(DenseCellsAccumulator), + Sparse(SparseCellsAccumulator), } - /// Copies data from the argument. - /// Overwrites data at common indices and extends `self` where necessary. 
- pub fn copy_from(&mut self, cells: Self) { - for (field, data) in cells.fields.into_iter() { - match self.fields.entry(field) { - Entry::Vacant(v) => { - v.insert(data); + impl CellsAccumulator { + pub fn new(schema: &SchemaData) -> Self { + match schema.array_type { + ArrayType::Dense => { + Self::Dense(DenseCellsAccumulator::new(schema)) } - Entry::Occupied(mut o) => { - let prev_write_data = o.get_mut(); - typed_field_data_cmp!( - prev_write_data, - data, - _DT, - ref mut mine, - theirs, - { - if mine.len() <= theirs.len() { - *mine = theirs; - } else { - mine[0..theirs.len()] - .clone_from_slice(theirs.as_slice()); - } - }, - unreachable!() - ); + ArrayType::Sparse => { + Self::Sparse(SparseCellsAccumulator::new(schema)) } } } - } - /// Shortens the cells, keeping the first `len` records and dropping the rest. - pub fn truncate(&mut self, len: usize) { - for data in self.fields.values_mut() { - data.truncate(len) + pub fn cells(&self) -> &Cells { + match self { + Self::Dense(ref d) => d.cells(), + Self::Sparse(ref s) => s.cells(), + } } - } - /// Extends this cell data with the contents of another. - /// - /// # Panics - /// - /// Panics if the set of fields in `self` and `other` do not match. - /// - /// Panics if any field in `self` and `other` has a different type. - pub fn extend(&mut self, other: Self) { - let mut other = other; - for (field, data) in self.fields.iter_mut() { - let other_data = other.fields.remove(field).unwrap(); - data.extend(other_data); - } - assert_eq!(other.fields.len(), 0); - } - - /// Returns a view over a slice of the cells, - /// with a subset of the fields viewed as indicated by `keys`. - /// This is useful for comparing a section of `self` to another `Cells` instance. 
- pub fn view<'a>( - &'a self, - keys: &'a [String], - slice: Range, - ) -> CellsView<'a> { - for k in keys.iter() { - if !self.fields.contains_key(k) { - panic!("Cannot construct view: key '{}' not found (fields are {:?})", - k, self.fields.keys()) + pub fn accumulate(&mut self, write: WriteInput) { + match write { + WriteInput::Sparse(w) => { + let Self::Sparse(ref mut sparse) = self else { + unreachable!() + }; + sparse.accumulate(w) + } + WriteInput::Dense(w) => { + let Self::Dense(ref mut dense) = self else { + unreachable!() + }; + dense.accumulate(w) + } } } - CellsView { - cells: self, - keys, - slice, - } - } - - /// Returns a comparator for ordering indices into the cells. - fn index_comparator<'a>( - &'a self, - keys: &'a [String], - ) -> impl Fn(&usize, &usize) -> Ordering + 'a { - move |l: &usize, r: &usize| -> Ordering { - for key in keys.iter() { - typed_field_data_go!(self.fields[key], ref data, { - match BitsOrd::bits_cmp(&data[*l], &data[*r]) { - Ordering::Less => return Ordering::Less, - Ordering::Greater => return Ordering::Greater, - Ordering::Equal => continue, - } - }) + pub fn attach_read<'data, B>( + &'data self, + b: B, + ) -> TileDBResult< + CallbackVarArgReadBuilder< + 'data, + MapAdapter, + B, + >, + > + where + B: ReadQueryBuilder<'data>, + { + match self { + Self::Dense(ref d) => d.attach_read(b), + Self::Sparse(ref s) => s.attach_read(b), } - Ordering::Equal } } - /// Returns whether the cells are sorted according to `keys`. See `Self::sort`. 
- pub fn is_sorted(&self, keys: &[String]) -> bool { - let index_comparator = self.index_comparator(keys); - for i in 1..self.len() { - if index_comparator(&(i - 1), &i) == Ordering::Greater { - return false; - } - } - true - } + fn do_write_readback( + ctx: &Context, + schema_spec: Rc, + write_sequence: WriteSequence, + ) -> TileDBResult<()> { + let test_uri = uri::get_uri_generator() + .map_err(|e| Error::Other(e.to_string()))?; + let uri = test_uri + .with_path("array") + .map_err(|e| Error::Other(e.to_string()))?; - /// Sorts the cells using `keys`. If two elements are equal on the first item in `keys`, - /// then they will be ordered using the second; and so on. - /// May not preserve the order of elements which are equal for all fields in `keys`. - pub fn sort(&mut self, keys: &[String]) { - let mut idx = std::iter::repeat(()) - .take(self.len()) - .enumerate() - .map(|(i, _)| i) - .collect::>(); - - let idx_comparator = self.index_comparator(keys); - idx.sort_by(idx_comparator); - - for data in self.fields.values_mut() { - typed_field_data_go!(data, ref mut data, { - let mut unsorted = std::mem::replace( - data, - vec![Default::default(); data.len()], - ); - for i in 0..unsorted.len() { - data[i] = std::mem::take(&mut unsorted[idx[i]]); - } - }); - } - } + let schema_in = schema_spec + .create(ctx) + .expect("Error constructing arbitrary schema"); + Array::create(ctx, &uri, schema_in).expect("Error creating array"); - /// Returns a copy of the cells, sorted as if by `self.sort()`. - pub fn sorted(&self, keys: &[String]) -> Self { - let mut sorted = self.clone(); - sorted.sort(keys); - sorted - } - - /// Returns the list of offsets beginning each group, i.e. run of contiguous values on `keys`. - /// - /// This is best used with sorted cells, but that is not required. - /// For each pair of offsets in the output, all cells in that index range are equal; - /// and the adjacent cells outside of the range are not equal. 
- pub fn identify_groups(&self, keys: &[String]) -> Option> { - if self.is_empty() { - return None; - } - let mut groups = vec![0]; - let mut icmp = 0; - for i in 1..self.len() { - let distinct = keys.iter().any(|k| { - let v = self.fields().get(k).unwrap(); - typed_field_data_go!( - v, - ref cells, - cells[i].bits_ne(&cells[icmp]) - ) - }); - if distinct { - groups.push(i); - icmp = i; - } - } - groups.push(self.len()); - Some(groups) - } - - /// Returns the number of distinct values grouped on `keys` - pub fn count_distinct(&self, keys: &[String]) -> usize { - if self.len() <= 1 { - return self.len(); - } + let mut accumulated_domain: Option = None; + let mut accumulated_write = CellsAccumulator::new(&schema_spec); - let key_cells = { - let key_fields = self - .fields + /* + * Results do not come back in a defined order, so we must sort and + * compare. Writes currently have to write all fields. + */ + let sort_keys = match write_sequence { + WriteSequence::Dense(_) => schema_spec + .attributes .iter() - .filter(|(k, _)| keys.contains(k)) - .map(|(k, v)| (k.clone(), v.clone())) - .collect::>(); - Cells::new(key_fields).sorted(keys) + .map(|f| f.name.clone()) + .collect::>(), + WriteSequence::Sparse(_) => schema_spec + .fields() + .map(|f| f.name().to_owned()) + .collect::>(), }; - let mut icmp = 0; - let mut count = 1; - - for i in 1..key_cells.len() { - let distinct = keys.iter().any(|k| { - let v = key_cells.fields().get(k).unwrap(); - typed_field_data_go!( - v, - ref cells, - cells[i].bits_ne(&cells[icmp]) - ) - }); - if distinct { - icmp = i; - count += 1; - } - } - - count - } + for write in write_sequence { + /* write data and preserve ranges for sanity check */ + let write_ranges = { + let array = Array::open(ctx, &uri, Mode::Write) + .expect("Error opening array"); - /// Returns a subset of the records using the bitmap to determine which are included - pub fn filter(&self, set: &VarBitSet) -> Cells { - Self::new( - self.fields() - .iter() - .map(|(k, v)| 
(k.clone(), v.filter(set))) - .collect::>(), - ) - } - - /// Returns a subset of `self` containing only cells which have distinct values in `keys` - /// such that `self.dedup(keys).count_distinct(keys) == self.len()`. - /// The order of cells in the input is preserved and the - /// first cell for each value of `keys` is preserved in the output. - pub fn dedup(&self, keys: &[String]) -> Cells { - if self.is_empty() { - return self.clone(); - } - - let mut idx = (0..self.len()).collect::>(); - - let idx_comparator = self.index_comparator(keys); - idx.sort_by(idx_comparator); - - let mut icmp = 0; - let mut preserve = VarBitSet::new_bitset(idx.len()); - preserve.set(idx[0]); - - for i in 1..idx.len() { - let distinct = keys.iter().any(|k| { - let v = self.fields.get(k).unwrap(); - typed_field_data_go!( - v, - ref field_cells, - field_cells[idx[i]].bits_ne(&field_cells[idx[icmp]]) - ) - }); - if distinct { - icmp = i; - preserve.set(idx[i]); - } - } - - self.filter(&preserve) - } + let write_query = write + .attach_write( + WriteBuilder::new(array) + .expect("Error building write query"), + ) + .expect("Error building write query") + .build(); + write_query.submit().expect("Error running write query"); - /// Returns a copy of `self` with only the fields in `fields`, - /// or `None` if not all the requested fields are present. - pub fn projection(&self, fields: &[&str]) -> Option { - let projection = fields - .iter() - .map(|f| { - self.fields - .get(*f) - .map(|data| (f.to_string(), data.clone())) - }) - .collect::>>()?; - Some(Cells::new(projection)) - } + let write_ranges = if let Some(ranges) = write.subarray() { + let generic_ranges = ranges + .iter() + .cloned() + .map(|r| vec![r]) + .collect::>>(); + assert_eq!( + generic_ranges, + write_query.subarray().unwrap().ranges().unwrap() + ); + Some(generic_ranges) + } else { + None + }; - /// Adds an additional field to `self`. Returns `true` if successful, - /// i.e. 
the field data is valid for the current set of cells - /// and there is not already a field for the key. - pub fn add_field(&mut self, key: &str, values: FieldData) -> bool { - if self.len() != values.len() { - return false; - } + let _ = write_query + .finalize() + .expect("Error finalizing write query"); - if self.fields.contains_key(key) { - false - } else { - self.fields.insert(key.to_owned(), values); - true - } - } -} + write_ranges + }; -impl BitsEq for Cells { - fn bits_eq(&self, other: &Self) -> bool { - for (key, mine) in self.fields().iter() { - if let Some(theirs) = other.fields().get(key) { - if !mine.bits_eq(theirs) { - return false; - } - } else { - return false; + if write.cells().is_empty() { + // in this case, writing and finalizing does not create a new fragment + // TODO + continue; } - } - self.fields().keys().len() == other.fields().keys().len() - } -} - -pub struct StructuredCells { - dimensions: Vec, - cells: Cells, -} - -impl StructuredCells { - pub fn new(dimensions: Vec, cells: Cells) -> Self { - let expected_cells: usize = dimensions.iter().cloned().product(); - assert_eq!(expected_cells, cells.len(), "Dimensions: {:?}", dimensions); - - StructuredCells { dimensions, cells } - } - - pub fn num_dimensions(&self) -> usize { - self.dimensions.len() - } - - /// Returns the span of dimension `d` - pub fn dimension_len(&self, d: usize) -> usize { - self.dimensions[d] - } - pub fn into_inner(self) -> Cells { - self.cells - } + /* NB: results are not read back in a defined order, so we must sort and compare */ - pub fn slice(&self, slices: Vec>) -> Self { - assert_eq!(slices.len(), self.dimensions.len()); // this is doable but unimportant + let mut array = ArrayOpener::new(ctx, &uri, Mode::Read) + .unwrap() + .open() + .unwrap(); - struct NextIndex<'a> { - dimensions: &'a [usize], - ranges: &'a [Range], - cursors: Option>, - } + /* + * First check fragment - its domain should match what we just wrote, and we need the + * timestamp so we can read 
back only this fragment + */ + let [timestamp_min, timestamp_max] = { + let fi = array.fragment_info().unwrap(); + let nf = fi.num_fragments().unwrap(); + assert!(nf > 0); - impl<'a> NextIndex<'a> { - fn new( - dimensions: &'a [usize], - ranges: &'a [Range], - ) -> Self { - for r in ranges { - if r.is_empty() { - return NextIndex { - dimensions, - ranges, - cursors: None, - }; - } - } + let this_fragment = fi.get_fragment(nf - 1).unwrap(); - NextIndex { - dimensions, - ranges, - cursors: Some( - ranges.iter().map(|r| r.start).collect::>(), - ), - } - } - - fn compute(&self) -> usize { - let Some(cursors) = self.cursors.as_ref() else { - unreachable!() - }; - let mut index = 0; - let mut scale = 1; - for i in 0..self.dimensions.len() { - let i = self.dimensions.len() - i - 1; - index += cursors[i] * scale; - scale *= self.dimensions[i]; - } - index - } - - fn advance(&mut self) { - let Some(cursors) = self.cursors.as_mut() else { - return; - }; - for d in 0..self.dimensions.len() { - let d = self.dimensions.len() - d - 1; - if cursors[d] + 1 < self.ranges[d].end { - cursors[d] += 1; - return; - } else { - cursors[d] = self.ranges[d].start; - } - } - - // this means that we reset the final dimension - self.cursors = None; - } - } - - impl Iterator for NextIndex<'_> { - type Item = usize; - fn next(&mut self) -> Option { - if self.cursors.is_some() { - let index = self.compute(); - self.advance(); - Some(index) + if let Some(write_domain) = write.domain() { + let nonempty_domain = + this_fragment.non_empty_domain().unwrap().untyped(); + assert_eq!(write_domain, nonempty_domain); } else { - None + // most recent fragment should be empty, + // what does that look like if no data was written? 
} - } - } - - let mut v = VarBitSet::new_bitset(self.cells.len()); - - NextIndex::new(self.dimensions.as_slice(), slices.as_slice()) - .for_each(|idx| v.set(idx)); - - StructuredCells { - dimensions: self.dimensions.clone(), - cells: self.cells.filter(&v), - } - } -} - -#[derive(Clone, Debug)] -pub struct CellsView<'a> { - cells: &'a Cells, - keys: &'a [String], - slice: Range, -} - -impl<'b> PartialEq> for CellsView<'_> { - fn eq(&self, other: &CellsView<'b>) -> bool { - // must have same number of values - if self.slice.len() != other.slice.len() { - return false; - } - for key in self.keys.iter() { - let Some(mine) = self.cells.fields.get(key) else { - // validated on construction - unreachable!() - }; - let Some(theirs) = other.cells.fields.get(key) else { - return false; + this_fragment.timestamp_range().unwrap() }; - typed_field_data_cmp!( - mine, - theirs, - _DT, - ref mine, - ref theirs, - if mine[self.slice.clone()] != theirs[other.slice.clone()] { - return false; - }, - return false - ); - } - - self.keys.len() == other.keys.len() - } -} - -/// Mask for whether a field should be included in a write query. -// As of this writing, core does not support default values being filled in, -// so this construct is not terribly useful. But someday that may change -#[derive(Copy, Clone, Debug, Eq, PartialEq)] -enum FieldMask { - /// This field must appear in the write set - Include, - /// This field appears in the write set but simplification may change that - TentativelyInclude, - /// This field may appear in the write set again after complication - _TentativelyExclude, - /// This field may not appear in the write set again - Exclude, -} + let safety_write_start = std::time::Instant::now(); -impl FieldMask { - pub fn is_included(&self) -> bool { - matches!(self, FieldMask::Include | FieldMask::TentativelyInclude) - } -} - -/// Value tree to shrink cells. 
-/// For a failing test which writes N records, there are 2^N possible -/// candidate subsets and we want to find the smallest one which fails the test -/// in the shortest number of iterations. -/// That would be ideal but really finding any input that's small enough -/// to be human readable sounds good enough. We divide the record space -/// into CELLS_VALUE_TREE_EXPLORE_PIECES chunks and identify which -/// of those chunks are necessary for the failure. -/// Recur until all of the chunks are necessary for failure, or there -/// is only one record. -/// -/// TODO: for var sized attributes, follow up by shrinking the values. -struct CellsValueTree { - _field_masks: HashMap, - field_data_tree: RecordsValueTree>, -} - -impl CellsValueTree { - pub fn new( - params: CellsParameters, - field_data: HashMap)>, - ) -> Self { - // sanity check - { - let mut nrecords = None; - for f in field_data.values() { - if let Some(f) = f.1.as_ref() { - if let Some(nrecords) = nrecords { - assert_eq!(nrecords, f.len()) - } else { - nrecords = Some(f.len()) - } + /* + * Then re-open the array to read back what we just wrote + * into the most recent fragment only + */ + { + array = array + .reopen() + .start_timestamp(timestamp_min) + .unwrap() + .end_timestamp(timestamp_max) + .unwrap() + .open() + .unwrap(); + + let mut read = write + .attach_read(ReadBuilder::new(array).unwrap()) + .unwrap() + .build(); + + if let Some(write_ranges) = write_ranges { + let read_ranges = + read.subarray().unwrap().ranges().unwrap(); + assert_eq!(write_ranges, read_ranges); } - } - } - - let field_masks = field_data - .iter() - .map(|(fname, &(fmask, _))| (fname.clone(), fmask)) - .collect::>(); - let field_data = field_data - .into_iter() - .filter(|&(_, (fmask, _))| fmask.is_included()) - .map(|(fname, (_, fdata))| (fname, fdata.unwrap())) - .collect::>(); - - let field_data_tree = - RecordsValueTree::new(params.min_records, field_data); - - CellsValueTree { - _field_masks: field_masks, - 
field_data_tree, - } - } -} - -impl ValueTree for CellsValueTree { - type Value = Cells; - - fn current(&self) -> Self::Value { - Cells::new(self.field_data_tree.current()) - } - - fn simplify(&mut self) -> bool { - self.field_data_tree.simplify() - } - - fn complicate(&mut self) -> bool { - self.field_data_tree.complicate() - } -} -#[derive(Clone, Debug)] -pub enum CellsStrategySchema { - /// Quick-and-dirty set of fields to write to - Fields(HashMap), - /// Schema for writing - WriteSchema(Rc), - /// Schema for reading - ReadSchema(Rc), -} - -impl CellsStrategySchema { - pub fn array_schema(&self) -> Option<&SchemaData> { - match self { - Self::WriteSchema(s) | Self::ReadSchema(s) => Some(s.as_ref()), - _ => None, - } - } - - fn new_field_tree( - &self, - runner: &mut TestRunner, - nrecords: RangeInclusive, - ) -> HashMap)> { - let field_data_parameters_base = FieldDataParameters::default(); - - match self { - Self::Fields(fields) => { - let nrecords = nrecords.new_tree(runner).unwrap().current(); - - let field_mask = fields - .iter() - .map(|(k, v)| { - (k.to_string(), (FieldMask::TentativelyInclude, v)) - }) - .collect::>(); - - field_mask - .into_iter() - .map(|(field, (mask, (datatype, cell_val_num)))| { - let field_data = if mask.is_included() { - let params = FieldDataParameters { - nrecords: (nrecords..=nrecords).into(), - datatype: Some( - FieldStrategyDatatype::Datatype( - *datatype, - *cell_val_num, - ), - ), - ..field_data_parameters_base.clone() - }; - Some( - any_with::(params) - .new_tree(runner) - .unwrap() - .current(), - ) - } else { - None - }; - (field, (mask, field_data)) - }) - .collect::)>>( - ) - } - Self::WriteSchema(schema) => { - let field_mask = { - let dimensions_mask = { - let mask = match schema.array_type { - ArrayType::Dense => { - /* dense array coordinates are handled by a subarray */ - FieldMask::Exclude - } - ArrayType::Sparse => { - /* sparse array must write coordinates */ - FieldMask::Include - } - }; - schema - .domain - 
.dimension - .iter() - .map(|d| (SchemaField::from(d.clone()), mask)) - .collect::>() - }; + let (mut cells, _) = read.execute().unwrap(); - /* as of this writing, write queries must write to all attributes */ - let attributes_mask = schema - .attributes - .iter() - .map(|a| { - (SchemaField::from(a.clone()), FieldMask::Include) - }) - .collect::>(); - - dimensions_mask - .into_iter() - .chain(attributes_mask) - .collect::>() - }; - - if schema.array_type == ArrayType::Sparse - && !schema.allow_duplicates.unwrap_or(false) + /* `cells` should match the write */ { - // dimension coordinates must be unique, generate them first - let unique_keys = schema - .domain - .dimension - .iter() - .map(|d| d.name.clone()) - .collect::>(); - let dimension_data = schema - .domain - .dimension - .iter() - .map(|d| { - let params = FieldDataParameters { - nrecords: (*nrecords.end()..=*nrecords.end()) - .into(), - datatype: Some( - FieldStrategyDatatype::SchemaField( - SchemaField::Dimension(d.clone()), - ), - ), - ..field_data_parameters_base.clone() - }; - ( - d.name.clone(), - any_with::(params) - .new_tree(runner) - .unwrap() - .current(), - ) - }) - .collect::>(); - - let mut dedup_fields = - Cells::new(dimension_data).dedup(&unique_keys); - - // choose the number of records - let nrecords = { - /* - * TODO: not really accurate but in practice nrecords.start - * is probably zero so this is the easy lazy thing to do - */ - assert!(*nrecords.start() <= dedup_fields.len()); - - (*nrecords.start()..=dedup_fields.len()) - .new_tree(runner) - .unwrap() - .current() - }; - - field_mask.into_iter() - .map(|(field, mask)| { - let field_name = field.name().to_owned(); - let field_data = if let Some(mut dim) = dedup_fields.fields.remove(&field_name) { - assert!(field.is_dimension()); - dim.truncate(nrecords); - dim - } else { - assert!(field.is_attribute()); - let params = FieldDataParameters { - nrecords: (nrecords..=nrecords).into(), - datatype: 
Some(FieldStrategyDatatype::SchemaField(field)), - ..field_data_parameters_base.clone() - }; - any_with::(params) - .new_tree(runner) - .unwrap() - .current() - }; - assert_eq!(nrecords, field_data.len()); - (field_name, (mask, Some(field_data))) - }) - .collect::)>>() - } else { - let nrecords = nrecords.new_tree(runner).unwrap().current(); - field_mask - .into_iter() - .map(|(field, mask)| { - let field_name = field.name().to_string(); - let field_data = if mask.is_included() { - let params = FieldDataParameters { - nrecords: (nrecords..=nrecords).into(), - datatype: Some( - FieldStrategyDatatype::SchemaField(field), - ), - ..field_data_parameters_base.clone() - }; - Some( - any_with::(params) - .new_tree(runner) - .unwrap() - .current(), - ) - } else { - None - }; - (field_name, (mask, field_data)) - }) - .collect::)>>( - ) + let write_sorted = write.cells().sorted(&sort_keys); + cells.sort(&sort_keys); + assert_eq!(write_sorted, cells); } - } - Self::ReadSchema(_) => { - /* presumably any subset of the fields */ - unimplemented!() - } - } - } -} - -#[derive(Clone, Debug)] -pub struct CellsParameters { - pub schema: Option, - pub min_records: usize, - pub max_records: usize, - pub cell_min_var_size: usize, - pub cell_max_var_size: usize, -} - -impl CellsParameters { - pub fn min_records_default() -> usize { - **crate::strategy::config::TILEDB_STRATEGY_CELLS_PARAMETERS_NUM_RECORDS_MIN - } - - pub fn max_records_default() -> usize { - **crate::strategy::config::TILEDB_STRATEGY_CELLS_PARAMETERS_NUM_RECORDS_MAX - } - pub fn cell_min_var_size_default() -> usize { - **crate::strategy::config::TILEDB_STRATEGY_CELLS_PARAMETERS_CELL_VAR_SIZE_MIN - } - - pub fn cell_max_var_size_default() -> usize { - **crate::strategy::config::TILEDB_STRATEGY_CELLS_PARAMETERS_CELL_VAR_SIZE_MAX - } -} - -impl Default for CellsParameters { - fn default() -> Self { - CellsParameters { - schema: None, - min_records: Self::min_records_default(), - max_records: Self::max_records_default(), - 
cell_min_var_size: Self::cell_min_var_size_default(), - cell_max_var_size: Self::cell_max_var_size_default(), - } - } -} - -#[derive(Debug)] -struct CellsStrategy { - schema: CellsStrategySchema, - params: CellsParameters, -} - -impl CellsStrategy { - pub fn new(schema: CellsStrategySchema, params: CellsParameters) -> Self { - CellsStrategy { schema, params } - } - - /// Returns an upper bound on the number of cells which can possibly be produced - fn nrecords_limit(&self) -> Option { - if let Some(schema) = self.schema.array_schema() { - if !schema.allow_duplicates.unwrap_or(true) { - return schema.domain.num_cells(); + array = read.finalize().unwrap(); } - } - None - } -} -impl Strategy for CellsStrategy { - type Tree = CellsValueTree; - type Value = Cells; + /* finally, check that everything written up until now is correct */ + array = array.reopen().start_timestamp(0).unwrap().open().unwrap(); - fn new_tree(&self, runner: &mut TestRunner) -> NewTree { - /* Choose the maximum number of records */ - let strat_nrecords = if let Some(limit) = self.nrecords_limit() { - if limit < self.params.min_records { - let r = format!("Schema and parameters are not satisfiable: schema.domain.num_cells() = {}, self.params.min_records = {}", limit, self.params.min_records); - return Err(proptest::test_runner::Reason::from(r)); + /* check array non-empty domain */ + if let Some(accumulated_domain) = accumulated_domain.as_mut() { + let Some(write_domain) = write.domain() else { + unreachable!() + }; + *accumulated_domain = accumulated_domain.union(&write_domain); } else { - let max_records = std::cmp::min(self.params.max_records, limit); - self.params.min_records..=max_records + accumulated_domain = write.domain(); } - } else { - self.params.min_records..=self.params.max_records - }; - - /* generate an initial set of fields to write */ - let field_tree = self.schema.new_field_tree(runner, strat_nrecords); - - Ok(CellsValueTree::new(self.params.clone(), field_tree)) - } -} - -impl 
Arbitrary for Cells { - type Parameters = CellsParameters; - type Strategy = BoxedStrategy; - - fn arbitrary_with(mut args: Self::Parameters) -> Self::Strategy { - if let Some(schema) = args.schema.take() { - CellsStrategy::new(schema, args).boxed() - } else { - let keys = crate::array::attribute::strategy::prop_attribute_name(); - let values = (any::(), any::()); - proptest::collection::hash_map(keys, values, 1..16) - .prop_flat_map(move |values| { - CellsStrategy::new( - CellsStrategySchema::Fields(values), - args.clone(), - ) - }) - .boxed() - } - } -} - -#[cfg(test)] -mod tests { - use super::*; - use crate::datatype::physical::BitsKeyAdapter; - use std::collections::HashSet; - - fn do_field_data_extend(dst: FieldData, src: FieldData) { - let orig_dst = dst.clone(); - let orig_src = src.clone(); - - let mut dst = dst; - dst.extend(src); - - typed_field_data_go!(dst, dst, { - assert_eq!( - orig_dst, - FieldData::from(dst[0..orig_dst.len()].to_vec()) - ); - assert_eq!( - orig_src, - FieldData::from(dst[orig_dst.len()..dst.len()].to_vec()) - ); - assert_eq!(dst.len(), orig_dst.len() + orig_src.len()); - }) - } - - fn do_cells_extend(dst: Cells, src: Cells) { - let orig_dst = dst.clone(); - let orig_src = src.clone(); - - let mut dst = dst; - dst.extend(src); - - for (fname, data) in dst.fields().iter() { - let orig_dst = orig_dst.fields().get(fname).unwrap(); - let orig_src = orig_src.fields().get(fname).unwrap(); - - typed_field_data_go!(data, ref dst, { - assert_eq!( - *orig_dst, - FieldData::from(dst[0..orig_dst.len()].to_vec()) - ); - assert_eq!( - *orig_src, - FieldData::from(dst[orig_dst.len()..dst.len()].to_vec()) - ); - assert_eq!(dst.len(), orig_dst.len() + orig_src.len()); - }); - } - - // all Cells involved should have same set of fields - assert_eq!(orig_dst.fields.len(), dst.fields.len()); - assert_eq!(orig_src.fields.len(), dst.fields.len()); - } - - fn do_cells_sort(cells: Cells, keys: Vec) { - let cells_sorted = cells.sorted(keys.as_slice()); - 
assert!(cells_sorted.is_sorted(keys.as_slice())); - - assert_eq!(cells.fields().len(), cells_sorted.fields().len()); - - if cells.is_sorted(keys.as_slice()) { - // running the sort should not have changed anything - assert_eq!(cells, cells_sorted); - } - - /* - * We want to verify that the contents of the records are the - * same before and after the sort. We can precisely do that - * with a hash join, though it's definitely tricky to turn - * the columnar data into rows, or we can approximate it - * by sorting and comparing each column, which is not fully - * precise but way easier. - */ - for (fname, data) in cells.fields().iter() { - let Some(data_sorted) = cells_sorted.fields().get(fname) else { - unreachable!() - }; - - let orig_sorted = { - let mut orig = data.clone(); - orig.sort(); - orig - }; - let sorted_sorted = { - let mut sorted = data_sorted.clone(); - sorted.sort(); - sorted - }; - assert_eq!(orig_sorted, sorted_sorted); - } - } - - fn do_cells_slice_1d(cells: Cells, slice: Range) { - let cells = StructuredCells::new(vec![cells.len()], cells); - let sliced = cells.slice(vec![slice.clone()]).into_inner(); - let cells = cells.into_inner(); - - assert_eq!(cells.fields().len(), sliced.fields().len()); - - for (key, value) in cells.fields().iter() { - let Some(sliced) = sliced.fields().get(key) else { - unreachable!() - }; - assert_eq!( - value.slice(slice.start, slice.end - slice.start), - *sliced - ); - } - } - - fn do_cells_slice_2d( - cells: Cells, - d1: usize, - d2: usize, - s1: Range, - s2: Range, - ) { - let mut cells = cells; - cells.truncate(d1 * d2); - - let cells = StructuredCells::new(vec![d1, d2], cells); - let sliced = cells.slice(vec![s1.clone(), s2.clone()]).into_inner(); - let cells = cells.into_inner(); - - assert_eq!(cells.fields().len(), sliced.fields().len()); - - for (key, value) in cells.fields.iter() { - let Some(sliced) = sliced.fields().get(key) else { - unreachable!() - }; - - assert_eq!(s1.len() * s2.len(), sliced.len()); - - 
typed_field_data_cmp!( - value, - sliced, - _DT, - ref value_data, - ref sliced_data, - { - for r in s1.clone() { - let value_start = (r * d2) + s2.start; - let value_end = (r * d2) + s2.end; - let value_expect = &value_data[value_start..value_end]; - - let sliced_start = (r - s1.start) * s2.len(); - let sliced_end = (r + 1 - s1.start) * s2.len(); - let sliced_cmp = &sliced_data[sliced_start..sliced_end]; - - assert_eq!(value_expect, sliced_cmp); - } - }, - unreachable!() - ); - } - } - - fn do_cells_slice_3d( - cells: Cells, - d1: usize, - d2: usize, - d3: usize, - s1: Range, - s2: Range, - s3: Range, - ) { - let mut cells = cells; - cells.truncate(d1 * d2 * d3); - - let cells = StructuredCells::new(vec![d1, d2, d3], cells); - let sliced = cells - .slice(vec![s1.clone(), s2.clone(), s3.clone()]) - .into_inner(); - let cells = cells.into_inner(); - - assert_eq!(cells.fields().len(), sliced.fields().len()); - - for (key, value) in cells.fields.iter() { - let Some(sliced) = sliced.fields.get(key) else { - unreachable!() - }; - - assert_eq!(s1.len() * s2.len() * s3.len(), sliced.len()); - - typed_field_data_cmp!( - value, - sliced, - _DT, - ref value_data, - ref sliced_data, - { - for z in s1.clone() { - for y in s2.clone() { - let value_start = - (z * d2 * d3) + (y * d3) + s3.start; - let value_end = (z * d2 * d3) + (y * d3) + s3.end; - let value_expect = - &value_data[value_start..value_end]; - - let sliced_start = - ((z - s1.start) * s2.len() * s3.len()) - + ((y - s2.start) * s3.len()); - let sliced_end = - ((z - s1.start) * s2.len() * s3.len()) - + ((y + 1 - s2.start) * s3.len()); - let sliced_cmp = - &sliced_data[sliced_start..sliced_end]; - - assert_eq!(value_expect, sliced_cmp); - } - } - }, - unreachable!() - ); - } - } - - /// Assert that the output of [Cells::identify_groups] produces - /// correct output for the given `keys`. 
- fn do_cells_identify_groups(cells: Cells, keys: &[String]) { - let Some(actual) = cells.identify_groups(keys) else { - assert!(cells.is_empty()); - return; - }; - - for w in actual.windows(2) { - let (start, end) = (w[0], w[1]); - assert!(start < end); - } - - for w in actual.windows(2) { - let (start, end) = (w[0], w[1]); - for k in keys.iter() { - let f = cells.fields().get(k).unwrap(); - typed_field_data_go!(f, ref field_cells, { - for i in start..end { - assert!(field_cells[start].bits_eq(&field_cells[i])); - } - }) - } - if end < cells.len() { - let some_ne = keys.iter().any(|k| { - let f = cells.fields().get(k).unwrap(); - typed_field_data_go!(f, ref field_cells, { - field_cells[start].bits_ne(&field_cells[end]) - }) - }); - assert!(some_ne); + { + let Some(acc) = accumulated_domain.as_ref() else { + unreachable!() + }; + let nonempty = + array.nonempty_domain().unwrap().unwrap().untyped(); + assert_eq!(*acc, nonempty); } - } - assert_eq!(Some(cells.len()), actual.last().copied()); - } + /* update accumulated expected array data */ + accumulated_write.accumulate(write); + { + let acc = accumulated_write.cells().sorted(&sort_keys); - fn do_cells_count_distinct_1d(cells: Cells) { - for (key, field_cells) in cells.fields().iter() { - let expect_count = - typed_field_data_go!(field_cells, ref field_cells, { - let mut c = field_cells.clone(); - c.sort_by(|l, r| l.bits_cmp(r)); - c.dedup_by(|l, r| l.bits_eq(r)); - c.len() - }); - - let keys_for_distinct = vec![key.clone()]; - let actual_count = - cells.count_distinct(keys_for_distinct.as_slice()); - - assert_eq!(expect_count, actual_count); - } - } + let cells = { + let mut read = accumulated_write + .attach_read(ReadBuilder::new(array).unwrap()) + .unwrap() + .build(); - fn do_cells_count_distinct_2d(cells: Cells) { - let keys = cells.fields().keys().collect::>(); - - for i in 0..keys.len() { - for j in 0..keys.len() { - let expect_count = { - typed_field_data_go!( - cells.fields().get(keys[i]).unwrap(), - ref 
ki_cells, - { - typed_field_data_go!( - cells.fields().get(keys[j]).unwrap(), - ref kj_cells, - { - let mut unique = HashMap::new(); - - for r in 0..ki_cells.len() { - let values = match unique - .entry(BitsKeyAdapter(&ki_cells[r])) - { - Entry::Vacant(v) => { - v.insert(HashSet::new()) - } - Entry::Occupied(o) => o.into_mut(), - }; - values.insert(BitsKeyAdapter( - &kj_cells[r], - )); - } - - unique.values().flatten().count() - } - ) - } - ) + let (mut cells, _) = read.execute().unwrap(); + cells.sort(&sort_keys); + cells }; - let keys_for_distinct = vec![keys[i].clone(), keys[j].clone()]; - let actual_count = - cells.count_distinct(keys_for_distinct.as_slice()); - - assert_eq!(expect_count, actual_count); + assert_eq!(acc, cells); } - } - } - - fn do_cells_dedup(cells: Cells, keys: Vec) { - let dedup = cells.dedup(keys.as_slice()); - assert_eq!(dedup.len(), dedup.count_distinct(keys.as_slice())); - - // invariant check - for field in dedup.fields().values() { - assert_eq!(dedup.len(), field.len()); - } - if dedup.is_empty() { - assert!(cells.is_empty()); - return; - } else if dedup.len() == cells.len() { - assert_eq!(cells, dedup); - return; - } - - // check that order within the original cells is preserved - assert_eq!(cells.view(&keys, 0..1), dedup.view(&keys, 0..1)); - - let mut in_cursor = 1; - let mut out_cursor = 1; - - while in_cursor < cells.len() && out_cursor < dedup.len() { - if cells.view(&keys, in_cursor..(in_cursor + 1)) - == dedup.view(&keys, out_cursor..(out_cursor + 1)) + // safety valve to ensure we don't write two fragments in the same millisecond + if safety_write_start.elapsed() + < std::time::Duration::from_millis(1) { - out_cursor += 1; - in_cursor += 1; - } else { - in_cursor += 1; + std::thread::sleep(std::time::Duration::from_millis(1)); } } - assert_eq!(dedup.len(), out_cursor); - } - - fn do_cells_projection(cells: Cells, keys: Vec) { - let proj = cells - .projection(&keys.iter().map(|s| s.as_ref()).collect::>()) - .unwrap(); - - for 
key in keys.iter() { - let Some(field_in) = cells.fields().get(key) else { - unreachable!() - }; - let Some(field_out) = proj.fields().get(key) else { - unreachable!() - }; - - assert_eq!(field_in, field_out); - } - // everything in `keys` is in the projection, there should be no other fields - assert_eq!(keys.len(), proj.fields().len()); + Ok(()) } - proptest! { - #[test] - fn field_data_extend((dst, src) in (any::(), any::()).prop_flat_map(|(dt, cvn)| { - let params = FieldDataParameters { - datatype: Some(FieldStrategyDatatype::Datatype(dt, cvn)), - ..Default::default() - }; - (any_with::(params.clone()), any_with::(params.clone())) - })) { - do_field_data_extend(dst, src) - } - - #[test] - fn cells_extend((dst, src) in any::().prop_flat_map(|s| { - let params = CellsParameters { - schema: Some(CellsStrategySchema::WriteSchema(Rc::new(s))), - ..Default::default() - }; - (any_with::(params.clone()), any_with::(params.clone())) - })) { - do_cells_extend(dst, src) - } + /// Test that a single write can be read back correctly + #[test] + fn write_once_readback() -> TileDBResult<()> { + let ctx = Context::new().expect("Error creating context"); - #[test] - fn cells_sort((cells, keys) in any::().prop_flat_map(|c| { - let keys = c.fields().keys().cloned().collect::>(); - let nkeys = keys.len(); - (Just(c), proptest::sample::subsequence(keys, 0..=nkeys).prop_shuffle()) - })) { - do_cells_sort(cells, keys) - } + let schema_req = query_write_schema_requirements(None); - #[test] - fn cells_slice_1d((cells, bound1, bound2) in any::().prop_flat_map(|cells| { - let slice_min = 0; - let slice_max = cells.len(); - (Just(cells), - slice_min..=slice_max, - slice_min..=slice_max) - })) { - let start = std::cmp::min(bound1, bound2); - let end = std::cmp::max(bound1, bound2); - do_cells_slice_1d(cells, start.. 
end) - } + let strategy = any_with::(Rc::new(schema_req)) + .prop_flat_map(|schema| { + let schema = Rc::new(schema); + ( + Just(Rc::clone(&schema)), + any_with::(WriteParameters::default_for( + schema, + )) + .prop_map(WriteSequence::from), + ) + }); - #[test] - fn cells_slice_2d((cells, d1, d2, b11, b12, b21, b22) in any_with::(CellsParameters { - min_records: 1, - ..Default::default() - }).prop_flat_map(|cells| { - let ncells = cells.len(); - (Just(cells), - 1..=((ncells as f64).sqrt() as usize), - 1..=((ncells as f64).sqrt() as usize)) - .prop_flat_map(|(cells, d1, d2)| { - (Just(cells), - Just(d1), - Just(d2), - 0..=d1, - 0..=d1, - 0..=d2, - 0..=d2) - }) - })) { - let s1 = std::cmp::min(b11, b12).. std::cmp::max(b11, b12); - let s2 = std::cmp::min(b21, b22).. std::cmp::max(b21, b22); - do_cells_slice_2d(cells, d1, d2, s1, s2) - } + proptest!(|((schema_spec, write_sequence) in strategy)| { + do_write_readback(&ctx, schema_spec, write_sequence)?; + }); - #[test] - fn cells_slice_3d((cells, d1, d2, d3, b11, b12, b21, b22, b31, b32) in any_with::(CellsParameters { - min_records: 1, - ..Default::default() - }).prop_flat_map(|cells| { - let ncells = cells.len(); - (Just(cells), - 1..=((ncells as f64).cbrt() as usize), - 1..=((ncells as f64).cbrt() as usize), - 1..=((ncells as f64).cbrt() as usize)) - .prop_flat_map(|(cells, d1, d2, d3)| { - (Just(cells), - Just(d1), - Just(d2), - Just(d3), - 0..=d1, - 0..=d1, - 0..=d2, - 0..=d2, - 0..=d3, - 0..=d3) - }) - })) { - let s1 = std::cmp::min(b11, b12).. std::cmp::max(b11, b12); - let s2 = std::cmp::min(b21, b22).. std::cmp::max(b21, b22); - let s3 = std::cmp::min(b31, b32).. 
std::cmp::max(b31, b32); - do_cells_slice_3d(cells, d1, d2, d3, s1, s2, s3) - } + Ok(()) + } - #[test] - fn cells_identify_groups((cells, keys) in any::().prop_flat_map(|c| { - let keys = c.fields().keys().cloned().collect::>(); - let nkeys = keys.len(); - (Just(c), proptest::sample::subsequence(keys, 0..=nkeys)) - })) - { - do_cells_identify_groups(cells, &keys) - } + /// Test that each write in the sequence can be read back correctly at the right timestamp + #[test] + fn write_sequence_readback() -> TileDBResult<()> { + let ctx = Context::new().expect("Error creating context"); - #[test] - fn cells_count_distinct_1d(cells in any::()) { - do_cells_count_distinct_1d(cells) - } + let schema_req = + query_write_schema_requirements(Some(ArrayType::Sparse)); - #[test] - fn cells_count_distinct_2d(cells in any::()) { - prop_assume!(cells.fields().len() >= 2); - do_cells_count_distinct_2d(cells) - } + let strategy = any_with::(Rc::new(schema_req)) + .prop_flat_map(|schema| { + let schema = Rc::new(schema); + ( + Just(Rc::clone(&schema)), + any_with::( + WriteSequenceParameters::default_for(Rc::clone( + &schema, + )), + ), + ) + }); - #[test] - fn cells_dedup((cells, keys) in any::().prop_flat_map(|c| { - let keys = c.fields().keys().cloned().collect::>(); - let nkeys = keys.len(); - (Just(c), proptest::sample::subsequence(keys, 0..=nkeys).prop_shuffle()) - })) - { - do_cells_dedup(cells, keys) - } + proptest!(|((schema_spec, write_sequence) in strategy)| { + do_write_readback(&ctx, schema_spec, write_sequence)?; + }); - #[test] - fn cells_projection((cells, keys) in any::().prop_flat_map(|c| { - let keys = c.fields().keys().cloned().collect::>(); - let nkeys = keys.len(); - (Just(c), proptest::sample::subsequence(keys, 0..=nkeys).prop_shuffle()) - })) { - do_cells_projection(cells, keys) - } + Ok(()) } } diff --git a/tiledb/api/src/query/subarray.rs b/tiledb/api/src/query/subarray.rs index e6aec39f..84a97589 100644 --- a/tiledb/api/src/query/subarray.rs +++ 
b/tiledb/api/src/query/subarray.rs @@ -2,17 +2,19 @@ use std::marker::PhantomData; use std::ops::Deref; use anyhow::anyhow; -use itertools::Itertools; use crate::array::Schema; use crate::context::{CApiInterface, Context, ContextBound}; use crate::datatype::PhysicalType; -use crate::error::{DatatypeErrorKind, Error}; +use crate::error::{DatatypeError, Error}; use crate::key::LookupKey; use crate::query::QueryBuilder; use crate::range::{Range, SingleValueRange, TypedRange, VarValueRange}; use crate::Result as TileDBResult; -use crate::{physical_type_go, single_value_range_go, var_value_range_go}; + +use tiledb_common::{ + physical_type_go, single_value_range_go, var_value_range_go, +}; pub(crate) enum RawSubarray { Owned(*mut ffi::tiledb_subarray_t), @@ -313,10 +315,7 @@ where let dim = schema.domain()?.dimension(dim_idx)?; let dtype = dim.datatype()?; if dtype.is_compatible_type::() { - return Err(Error::Datatype(DatatypeErrorKind::TypeMismatch { - user_type: std::any::type_name::().to_owned(), - tiledb_type: dtype, - })); + Err(DatatypeError::physical_type_incompatible::(dtype))?; } if points.is_empty() { @@ -371,200 +370,6 @@ where } } -/// Encapsulates data for a subarray. -#[derive(Clone, Debug, Eq, Hash, PartialEq)] -pub struct SubarrayData { - /// List of requested ranges on each dimension. - /// The outer `Vec` is the list of dimensions and the inner `Vec` - /// is the list of requested ranges for that dimension. - /// If a list is empty for a dimension, then all the coordinates - /// of that dimension are selected. - pub dimension_ranges: Vec>, -} - -impl SubarrayData { - /// Returns a new `SubarrayData` which represents the intersection - /// of all the ranges of `self` with a new set of `ranges` on each dimension. - /// - /// If any dimension does not have any intersection with `ranges`, then - /// this returns `None` as the resulting subarray would select no coordinates. 
- pub fn intersect_ranges(&self, ranges: &[Range]) -> Option { - let updated_ranges = self - .dimension_ranges - .iter() - .zip(ranges.iter()) - .map(|(current_ranges, new_range)| { - if current_ranges.is_empty() { - // empty means select the whole thing - vec![new_range.clone()] - } else { - current_ranges - .iter() - .filter_map(|current_range| { - current_range.intersection(new_range) - }) - .collect::>() - } - }) - .collect::>>(); - - if updated_ranges.iter().any(|dim| dim.is_empty()) { - None - } else { - Some(SubarrayData { - dimension_ranges: updated_ranges, - }) - } - } - - /// Returns a new `SubarrayData` which represents the intersection - /// of all the ranges of `self` with all of the ranges of `other` on each dimension. - /// - /// ``` - /// use tiledb::query::subarray::SubarrayData; - /// use tiledb::range::Range; - /// - /// let s1 = SubarrayData { - /// dimension_ranges: vec![ - /// vec![Range::from(&[0, 100]), Range::from(&[200, 300])], - /// vec![Range::from(&[2, 6]), Range::from(&[8, 12])], - /// vec![Range::from(&[20, 30]), Range::from(&[40, 50])] - /// ] - /// }; - /// let s2 = SubarrayData { - /// dimension_ranges: vec![ - /// vec![Range::from(&[150, 250])], - /// vec![Range::from(&[4, 10]), Range::from(&[12, 12])], - /// vec![Range::from(&[25, 45])] - /// ] - /// }; - /// let intersection = s1.intersect(&s2); - /// - /// assert_eq!(intersection, Some(SubarrayData { - /// dimension_ranges: vec![ - /// vec![Range::from(&[200, 250])], - /// vec![Range::from(&[4, 6]), Range::from(&[8, 10]), Range::from(&[12, 12])], - /// vec![Range::from(&[25, 30]), Range::from(&[40, 45])] - /// ] - /// })); - /// ``` - /// - /// If any dimension does not have any intersection, then this returns `None` - /// as the resulting subarray would select no coordinates. 
- /// ``` - /// use tiledb::query::subarray::SubarrayData; - /// use tiledb::range::Range; - /// - /// let s1 = SubarrayData { - /// dimension_ranges: vec![ - /// vec![Range::from(&[50, 100]), Range::from(&[400, 450])] - /// ] - /// }; - /// let s2 = SubarrayData { - /// dimension_ranges: vec![ - /// vec![Range::from(&[150, 250]), Range::from(&[300, 350])], - /// ] - /// }; - /// let intersection = s1.intersect(&s2); - /// assert_eq!(intersection, None); - /// ``` - /// - /// If a dimension in `self` (without loss of generality) has no ranges, - /// then it is a special case which means to select the all coordinates. - /// The intersection is equal to the ranges of `other`. - /// ``` - /// use tiledb::query::subarray::SubarrayData; - /// use tiledb::range::Range; - /// - /// let s1 = SubarrayData { - /// dimension_ranges: vec![ - /// vec![] - /// ] - /// }; - /// let s2 = SubarrayData { - /// dimension_ranges: vec![ - /// vec![Range::from(&[150, 250]), Range::from(&[300, 350])], - /// ] - /// }; - /// let intersection = s1.intersect(&s2); - /// assert_eq!(intersection, Some(s2.clone())); - /// ``` - pub fn intersect(&self, other: &SubarrayData) -> Option { - let updated_ranges = self - .dimension_ranges - .iter() - .zip(other.dimension_ranges.iter()) - .map(|(my_dimension, their_dimension)| { - if my_dimension.is_empty() { - // empty means select all coordinates - their_dimension.clone() - } else if their_dimension.is_empty() { - // empty means select all coordinates - my_dimension.clone() - } else { - my_dimension - .iter() - .cartesian_product(their_dimension.iter()) - .filter_map(|(rm, rt)| rm.intersection(rt)) - .collect::>() - } - }) - .collect::>>(); - - if updated_ranges.iter().any(|dim| dim.is_empty()) { - None - } else { - Some(SubarrayData { - dimension_ranges: updated_ranges, - }) - } - } -} - -#[cfg(any(test, feature = "proptest-strategies"))] -pub mod strategy { - use std::rc::Rc; - - use proptest::prelude::*; - - use super::*; - use 
crate::array::SchemaData; - - impl Arbitrary for SubarrayData { - type Parameters = Option>; - type Strategy = BoxedStrategy; - - fn arbitrary_with(params: Self::Parameters) -> Self::Strategy { - let strat_dimension_ranges = if let Some(schema) = params { - schema - .domain - .dimension - .iter() - .map(|d| d.subarray_strategy(None).unwrap()) - .collect::>>() - } else { - todo!() - }; - - const DIMENSION_MIN_RANGES: usize = 0; - const DIMENSION_MAX_RANGES: usize = 4; - - strat_dimension_ranges - .into_iter() - .map(|strat_range| { - proptest::collection::vec( - strat_range, - DIMENSION_MIN_RANGES..=DIMENSION_MAX_RANGES, - ) - .boxed() - }) - .collect::>>>() - .prop_map(|dimension_ranges| SubarrayData { dimension_ranges }) - .boxed() - } - } -} - #[cfg(test)] mod tests { use std::hash::{DefaultHasher, Hash, Hasher}; @@ -572,7 +377,9 @@ mod tests { use itertools::izip; use proptest::prelude::*; - use tiledb_test_utils::{self, TestArrayUri}; + use tiledb_pod::array::schema::SchemaData; + use tiledb_pod::query::subarray::SubarrayData; + use uri::{self, TestArrayUri}; use super::*; use crate::array::*; @@ -588,7 +395,7 @@ mod tests { fn default_subarray_constrained() -> TileDBResult<()> { let ctx = Context::new().unwrap(); - let test_uri = tiledb_test_utils::get_uri_generator() + let test_uri = uri::get_uri_generator() .map_err(|e| Error::Other(e.to_string()))?; let test_uri = crate::array::tests::create_quickstart_dense(&test_uri, &ctx)?; @@ -636,7 +443,7 @@ mod tests { fn default_subarray_unconstrained() -> TileDBResult<()> { let ctx = Context::new().unwrap(); - let test_uri = tiledb_test_utils::get_uri_generator() + let test_uri = uri::get_uri_generator() .map_err(|e| Error::Other(e.to_string()))?; let test_uri = crate::array::tests::create_quickstart_sparse_string( &test_uri, &ctx, @@ -685,7 +492,7 @@ mod tests { #[test] fn test_dense_ranges() -> TileDBResult<()> { let ctx = Context::new().unwrap(); - let test_uri = tiledb_test_utils::get_uri_generator() + let 
test_uri = uri::get_uri_generator() .map_err(|e| Error::Other(e.to_string()))?; test_ranges(&ctx, ArrayType::Dense, &test_uri) } @@ -693,7 +500,7 @@ mod tests { #[test] fn test_sparse_ranges() -> TileDBResult<()> { let ctx = Context::new().unwrap(); - let test_uri = tiledb_test_utils::get_uri_generator() + let test_uri = uri::get_uri_generator() .map_err(|e| Error::Other(e.to_string()))?; test_ranges(&ctx, ArrayType::Sparse, &test_uri) } @@ -767,184 +574,11 @@ mod tests { Ok(array_uri) } - fn do_subarray_intersect_ranges(subarray: &SubarrayData, ranges: &[Range]) { - if let Some(intersection) = subarray.intersect_ranges(ranges) { - assert_eq!( - subarray.dimension_ranges.len(), - intersection.dimension_ranges.len() - ); - assert_eq!(subarray.dimension_ranges.len(), ranges.len()); - - for (before, after, update) in izip!( - subarray.dimension_ranges.iter(), - intersection.dimension_ranges.iter(), - ranges.iter() - ) { - if before.is_empty() { - assert_eq!(vec![update.clone()], *after); - continue; - } - - assert!(after.len() <= before.len()); - - let mut r_after = after.iter(); - for r_before in before.iter() { - if let Some(r) = r_before.intersection(update) { - assert_eq!(*r_after.next().unwrap(), r); - } - } - assert_eq!(None, r_after.next()); - } - } else { - // for at least one dimension, none of the ranges could have intersected - let found_empty_intersection = subarray - .dimension_ranges - .iter() - .zip(ranges.iter()) - .any(|(current, new)| { - if current.is_empty() { - false - } else { - current.iter().all(|r| r.intersection(new).is_none()) - } - }); - assert!( - found_empty_intersection, - "dimensions: {:?}", - subarray - .dimension_ranges - .iter() - .zip(ranges.iter()) - .map(|(d, r)| format!( - "({:?} && {:?} = {:?}", - d, - r, - d.iter() - .map(|dr| dr.intersection(r)) - .collect::>>() - )) - .collect::>() - ); - } - } - - /// Validate the intersection of two subarrays. - /// `s1` and `s2` are two subarrays for the same schema. 
- fn do_subarray_intersect_subarray(s1: &SubarrayData, s2: &SubarrayData) { - if let Some(intersection) = s1.intersect(s2) { - for (di, ds1, ds2) in izip!( - intersection.dimension_ranges.iter(), - s1.dimension_ranges.iter(), - s2.dimension_ranges.iter(), - ) { - if ds1.is_empty() { - assert_eq!(di, ds2); - continue; - } else if ds2.is_empty() { - assert_eq!(di, ds1); - continue; - } - // there must be some pair from (rs1, rs2) where di is the intersection - for ri in di.iter() { - let found_input = ds1 - .iter() - .cartesian_product(ds2.iter()) - .any(|(rs1, rs2)| { - Some(ri) == rs1.intersection(rs2).as_ref() - }); - assert!(found_input, "ri = {:?}", ri); - } - - // and for all pairs (rs1, rs2), there must be some ri which covers - for (rs1, rs2) in ds1.iter().cartesian_product(ds2.iter()) { - let Some(intersection) = rs1.intersection(rs2) else { - continue; - }; - - let found_output = di.iter().any(|ri| intersection == *ri); - assert!( - found_output, - "rs1 = {:?}, rs2 = {:?}, intersection = {:?}", - rs1, rs2, intersection - ); - } - } - } else { - // for each least one dimension, none of the ranges of `s1` - // intersected with any range from `s2` - let found_empty_intersection = s1 - .dimension_ranges - .iter() - .zip(s2.dimension_ranges.iter()) - .any(|(ds1, ds2)| { - ds1.iter() - .cartesian_product(ds2.iter()) - .all(|(rs1, rs2)| rs1.intersection(rs2).is_none()) - }); - assert!(found_empty_intersection); - } - } - - fn strat_subarray_intersect_ranges( - ) -> impl Strategy)> { - use crate::array::domain::strategy::Requirements as DomainRequirements; - use crate::array::schema::strategy::Requirements as SchemaRequirements; - - let req = Rc::new(SchemaRequirements { - domain: Some(Rc::new(DomainRequirements { - num_dimensions: 1..=1, - ..Default::default() - })), - ..Default::default() - }); - - any_with::(req).prop_flat_map(|schema| { - let schema = Rc::new(schema); - ( - any_with::(Some(Rc::clone(&schema))), - schema.domain.subarray_strategy(), - ) - }) - } 
- - fn strat_subarray_intersect_subarray( - ) -> impl Strategy { - use crate::array::domain::strategy::Requirements as DomainRequirements; - use crate::array::schema::strategy::Requirements as SchemaRequirements; - - let req = Rc::new(SchemaRequirements { - domain: Some(Rc::new(DomainRequirements { - num_dimensions: 1..=1, - ..Default::default() - })), - ..Default::default() - }); - - any_with::(req).prop_flat_map(|schema| { - let schema = Rc::new(schema); - let strat_subarray = - any_with::(Some(Rc::clone(&schema))); - (strat_subarray.clone(), strat_subarray.clone()) - }) - } - - proptest! { - #[test] - fn subarray_intersect_ranges((subarray, range) in strat_subarray_intersect_ranges()) { - do_subarray_intersect_ranges(&subarray, &range) - } - - #[test] - fn subarray_intersect_subarray((s1, s2) in strat_subarray_intersect_subarray()) { - do_subarray_intersect_subarray(&s1, &s2) - } - } - #[test] fn dimension_ranges() { let ctx = Context::new().unwrap(); - let test_uri = tiledb_test_utils::get_uri_generator() + let test_uri = uri::get_uri_generator() .map_err(|e| Error::Other(e.to_string())) .unwrap(); let test_uri = crate::array::tests::create_quickstart_sparse_string( @@ -992,7 +626,6 @@ mod tests { let array = Array::open(&ctx, &test_uri, Mode::Read).unwrap(); Rc::new(SchemaData::try_from(array.schema().unwrap()).unwrap()) }; - let do_dimension_ranges = |subarray: SubarrayData| -> TileDBResult<()> { let array = Array::open(&ctx, &test_uri, Mode::Read).unwrap(); let mut q = ReadBuilder::new(array)? 
@@ -1047,7 +680,6 @@ mod tests { let expect_num_cells = num_cells_0 * num_cells_1; assert_eq!(expect_num_cells, rows.len()); } - for (row, col, att) in izip!(rows, cols, atts) { assert_eq!(att, derive_att(&row, &col)); @@ -1065,7 +697,7 @@ mod tests { let col_in_bounds = subarray.dimension_ranges[1].is_empty() || subarray.dimension_ranges[1].iter().any(|r| { - let Range::Single(SingleValueRange::Int32(lb, ub)) = r + let Range::Single(SingleValueRange::Int32(lb, ub)) = &r else { unreachable!() }; diff --git a/tiledb/api/src/query/write/input/arrow.rs b/tiledb/api/src/query/write/input/arrow.rs index bd60f0c7..3b7a70ab 100644 --- a/tiledb/api/src/query/write/input/arrow.rs +++ b/tiledb/api/src/query/write/input/arrow.rs @@ -10,16 +10,17 @@ use arrow::array::{ }; use arrow::buffer::OffsetBuffer; use arrow::datatypes::{ArrowPrimitiveType, Field}; +use tiledb_common::array::CellValNum; -use crate::array::{CellValNum, Schema}; -use crate::datatype::PhysicalType; -use crate::error::{DatatypeErrorKind, Error}; +use crate::array::Schema; +use crate::error::{DatatypeError, Error}; use crate::query::buffer::{ Buffer, CellStructure, QueryBuffers, TypedQueryBuffers, }; use crate::query::write::input::{ DataProvider, RecordProvider, TypedDataProvider, }; +use crate::query::CellValue; use crate::Result as TileDBResult; fn cell_structure_var( @@ -31,15 +32,10 @@ fn cell_structure_var( let expect_len = nz.get() as i64; for window in offsets.windows(2) { if window[1] - window[0] != expect_len { - return Err(Error::Datatype( - DatatypeErrorKind::UnexpectedCellStructure { - context: Some( - "Arrow array as query input".to_owned(), - ), - expected: cell_val_num, - found: CellValNum::Var, - }, - )); + return Err(Error::UnexpectedCellStructure { + expected: cell_val_num, + found: CellValNum::Var, + }); } } Ok(CellStructure::Fixed(nz)) @@ -56,16 +52,15 @@ fn cell_structure_fixed( cell_val_num: CellValNum, ) -> TileDBResult> { match cell_val_num { - CellValNum::Fixed(nz) if fixed_len as 
u32 != nz.get() => Err( - Error::Datatype(DatatypeErrorKind::UnexpectedCellStructure { - context: Some("Arrow array as query input".to_owned()), + CellValNum::Fixed(nz) if fixed_len as u32 != nz.get() => { + Err(Error::UnexpectedCellStructure { expected: cell_val_num, found: match NonZeroU32::new(fixed_len as u32) { Some(nz) => CellValNum::Fixed(nz), None => CellValNum::Var, }, - }), - ), + }) + } CellValNum::Fixed(nz) => Ok(CellStructure::Fixed(nz)), CellValNum::Var => { let offsets = Buffer::Owned({ @@ -97,11 +92,7 @@ where if nulls.null_count() == 0 { None } else { - return Err(Error::Datatype( - DatatypeErrorKind::UnexpectedValidity { - context: Some("Arrow array as query input".to_owned()), - }, - )); + return Err(Error::UnexpectedValidity); } } else { None @@ -116,15 +107,13 @@ fn apply_to_list_element_impl<'data, A>( ) -> TileDBResult> where A: ArrowPrimitiveType, - ::Native: PhysicalType, + ::Native: CellValue, TypedQueryBuffers<'data>: From as DataProvider>::Unit>>, { if elements.nulls().is_some() && elements.nulls().unwrap().null_count() > 0 { - return Err(Error::Datatype(DatatypeErrorKind::UnexpectedValidity { - context: Some("Arrow array list element".to_owned()), - })); + return Err(Error::UnexpectedValidity); } let data = Buffer::Borrowed(elements.values().as_ref()); @@ -256,7 +245,7 @@ fn apply_to_list_element<'data>( impl DataProvider for PrimitiveArray where A: ArrowPrimitiveType, - ::Native: PhysicalType, + ::Native: CellValue, { type Unit = ::Native; @@ -279,23 +268,14 @@ where } CellValNum::Fixed(nz) => { if self.values().len() % nz.get() as usize == 0 { - return Err(Error::Datatype( - DatatypeErrorKind::UnexpectedCellStructure { - context: None, - found: CellValNum::Fixed(nz), - expected: CellValNum::single(), - }, - )); + return Err(Error::UnexpectedCellStructure { + found: CellValNum::Fixed(nz), + expected: CellValNum::single(), + }); } if self.nulls().map(|n| n.null_count() > 0).unwrap_or(false) { - return Err(Error::Datatype( - 
DatatypeErrorKind::UnexpectedValidity { - context: Some( - "Arrow array list element".to_owned(), - ), - }, - )); + return Err(Error::UnexpectedValidity); } Ok(QueryBuffers { @@ -304,13 +284,10 @@ where validity: None, }) } - CellValNum::Var => Err(Error::Datatype( - DatatypeErrorKind::UnexpectedCellStructure { - context: None, - found: CellValNum::Var, - expected: CellValNum::single(), - }, - )), + CellValNum::Var => Err(Error::UnexpectedCellStructure { + found: CellValNum::Var, + expected: CellValNum::single(), + }), } } } @@ -568,7 +545,8 @@ impl<'data> Iterator for RecordBatchTileDBInputs<'data> { (None, None) => None, (Some(f), Some(c)) => { let Some((datatype, cell_val_num)) = - crate::datatype::arrow::from_arrow(f.data_type()).ok() + tiledb_common::datatype::arrow::from_arrow(f.data_type()) + .ok() else { return Some(Err(Error::InvalidArgument(anyhow!( format!( @@ -588,10 +566,9 @@ impl<'data> Iterator for RecordBatchTileDBInputs<'data> { }; if datatype != field_datatype { return Some(Err(Error::Datatype( - DatatypeErrorKind::InvalidDatatype { - context: Some(f.name().clone()), - found: datatype, - expected: field_datatype, + DatatypeError::LogicalTypeMismatch { + source_type: datatype, + target_type: field_datatype, }, ))); } @@ -601,13 +578,10 @@ impl<'data> Iterator for RecordBatchTileDBInputs<'data> { }; if cell_val_num != field_cell_val_num { /* TODO: we can be more flexible, e.g. 
fixed size list can go to Var */ - return Some(Err(Error::Datatype( - DatatypeErrorKind::UnexpectedCellStructure { - context: Some(f.name().clone()), - found: cell_val_num, - expected: field_cell_val_num, - }, - ))); + return Some(Err(Error::UnexpectedCellStructure { + found: cell_val_num, + expected: field_cell_val_num, + })); } let field_is_nullable = match tiledb_field.nullability() { Ok(is_nullable) => is_nullable, diff --git a/tiledb/api/src/query/write/input/mod.rs b/tiledb/api/src/query/write/input/mod.rs index 03f9dfab..59d18c87 100644 --- a/tiledb/api/src/query/write/input/mod.rs +++ b/tiledb/api/src/query/write/input/mod.rs @@ -2,18 +2,18 @@ use std::num::NonZeroU32; use std::rc::Rc; use crate::array::{CellValNum, Schema}; -use crate::datatype::PhysicalType; -use crate::error::{DatatypeErrorKind, Error}; +use crate::error::Error; use crate::query::buffer::{ Buffer, CellStructure, QueryBuffers, QueryBuffersMut, TypedQueryBuffers, }; +use crate::query::CellValue; use crate::Result as TileDBResult; #[cfg(feature = "arrow")] pub mod arrow; pub trait DataProvider { - type Unit: PhysicalType; + type Unit: CellValue; fn query_buffers( &self, @@ -52,7 +52,7 @@ where impl DataProvider for QueryBuffers<'_, C> where - C: PhysicalType, + C: CellValue, { type Unit = C; @@ -76,7 +76,7 @@ where impl DataProvider for QueryBuffersMut<'_, C> where - C: PhysicalType, + C: CellValue, { type Unit = C; @@ -106,13 +106,13 @@ where // some common impl logic inside of private functions. // Maybe we could make an adapter type in the future. 
trait AsSlice { - type Item: PhysicalType; + type Item: CellValue; fn values(&self) -> &[Self::Item]; } impl AsSlice for Vec where - T: PhysicalType, + T: CellValue, { type Item = T; fn values(&self) -> &[Self::Item] { @@ -122,7 +122,7 @@ where impl AsSlice for [T] where - T: PhysicalType, + T: CellValue, { type Item = T; fn values(&self) -> &[Self::Item] { @@ -158,13 +158,10 @@ where let expect_len = nz.get() as usize; for cell in items.iter() { if cell.values().len() != expect_len { - return Err(Error::Datatype( - DatatypeErrorKind::UnexpectedCellStructure { - context: None, - expected: CellValNum::Fixed(nz), - found: CellValNum::Var, - }, - )); + return Err(Error::UnexpectedCellStructure { + expected: CellValNum::Fixed(nz), + found: CellValNum::Var, + }); } } Ok(CellStructure::Fixed(nz)) @@ -186,7 +183,7 @@ where /// Helper function to implement `DataProvider::query_buffers` /// for types which resemble a nested slice. // (Without negative trait bounds we can't provide separate DataProvider -// impls for `Vec where C: PhysicalType` and `Vec where S: AsSlice`) +// impls for `Vec where C: CellValue` and `Vec where S: AsSlice`) fn query_buffers_impl( value: &[S], cell_val_num: CellValNum, @@ -221,7 +218,7 @@ where impl DataProvider for Vec where - C: PhysicalType, + C: CellValue, { type Unit = C; @@ -236,7 +233,7 @@ where impl DataProvider for [C] where - C: PhysicalType, + C: CellValue, { type Unit = C; @@ -261,7 +258,7 @@ where impl DataProvider for Vec> where - C: PhysicalType, + C: CellValue, { type Unit = C; @@ -375,7 +372,7 @@ mod tests { #[test] fn input_provider_strings( - stringvec in crate::query::buffer::strategy::prop_string_vec( + stringvec in crate::query::buffer::tests::prop_string_vec( (MIN_RECORDS..=MAX_RECORDS).into() ) ) { diff --git a/tiledb/api/src/query/write/mod.rs b/tiledb/api/src/query/write/mod.rs index 3f23420f..dd7e00df 100644 --- a/tiledb/api/src/query/write/mod.rs +++ b/tiledb/api/src/query/write/mod.rs @@ -249,6 +249,3 @@ impl<'data> 
WriteBuilder<'data> { Ok(b) } } - -#[cfg(any(test, feature = "proptest-strategies"))] -pub mod strategy; diff --git a/tiledb/api/src/query/write/strategy.rs b/tiledb/api/src/query/write/strategy.rs deleted file mode 100644 index 41f425b0..00000000 --- a/tiledb/api/src/query/write/strategy.rs +++ /dev/null @@ -1,1639 +0,0 @@ -use std::fmt::{Debug, Formatter, Result as FmtResult}; -use std::ops::{Deref, RangeInclusive}; -use std::rc::Rc; - -use proptest::prelude::*; -use proptest::strategy::{NewTree, ValueTree}; -use proptest::test_runner::TestRunner; -use serde_json::json; - -use crate::array::{ArrayType, CellOrder, CellValNum, SchemaData}; -use crate::datatype::physical::BitsOrd; -use crate::filter::strategy::Requirements as FilterRequirements; -use crate::query::read::{CallbackVarArgReadBuilder, MapAdapter}; -use crate::query::strategy::{ - Cells, CellsConstructor, CellsParameters, CellsStrategySchema, - FieldDataParameters, RawResultCallback, StructuredCells, -}; -use crate::query::{QueryBuilder, ReadQueryBuilder, WriteBuilder}; -use crate::range::{NonEmptyDomain, Range, SingleValueRange}; -use crate::{ - single_value_range_go, typed_field_data_go, Result as TileDBResult, -}; - -type BoxedValueTree = Box>; - -/// Returns a base set of requirements for filters to be used -/// in write queries. -/// -/// Requirements are chosen to either avoid -/// constraints on input (e.g. positive delta filtering requires -/// sorted input, float scale filtering is not invertible) -/// or to avoid issues in the tiledb core library in as -/// many scenarios as possible. -// now that we're actually writing data we will hit the fun bugs. -// there are several in the filter pipeline, so we must heavily -// restrict what is allowed until the bugs are fixed. 
-pub fn query_write_filter_requirements() -> FilterRequirements { - FilterRequirements { - allow_bit_reduction: false, // SC-47560 - allow_bit_shuffle: false, // SC-48409 - allow_byte_shuffle: false, // SC-48409 - allow_positive_delta: false, // nothing yet to ensure sort order - allow_scale_float: false, // not invertible due to precision loss - allow_xor: false, // SC-47328 - allow_compression_rle: false, // probably can be enabled but nontrivial - allow_compression_dict: false, // probably can be enabled but nontrivial - allow_compression_delta: false, // SC-47328 - allow_webp: false, // SC-51250 - ..Default::default() - } -} - -/// Returns a base set of schema requirements for running a query. -/// -/// Requirements are chosen to either avoid constraints on write input -/// or to avoid issues in the tiledb core library in as many scenarios as possible. -pub fn query_write_schema_requirements( - array_type: Option, -) -> crate::array::schema::strategy::Requirements { - // NB: 1 is the highest number that passes all cases (so don't use the value given by - // `DomainRequirements::default()`) but we want to enable environmental override. 
- use crate::array::domain::strategy::Requirements as DomainRequirements; - let env_max_dimensions = - DomainRequirements::env_max_dimensions().unwrap_or(1); - - crate::array::schema::strategy::Requirements { - domain: Some(Rc::new(crate::array::domain::strategy::Requirements { - array_type, - num_dimensions: 1..=env_max_dimensions, - dimension: Some(crate::array::dimension::strategy::Requirements { - filters: Some(Rc::new(query_write_filter_requirements())), - ..Default::default() - }), - ..Default::default() - })), - attribute_filters: Some(Rc::new(query_write_filter_requirements())), - coordinates_filters: Some(Rc::new(query_write_filter_requirements())), - offsets_filters: Some(Rc::new(query_write_filter_requirements())), - validity_filters: Some(Rc::new(query_write_filter_requirements())), - ..Default::default() - } -} - -#[derive(Clone, Debug)] -pub struct DenseWriteInput { - pub layout: CellOrder, - pub data: Cells, - pub subarray: Vec, -} - -impl DenseWriteInput { - /// Prepares a write query to insert data from this write. - pub fn attach_write<'data>( - &'data self, - b: WriteBuilder<'data>, - ) -> TileDBResult> { - let mut subarray = self.data.attach_write(b)?.start_subarray()?; - - for i in 0..self.subarray.len() { - subarray = subarray.add_range(i, self.subarray[i].clone())?; - } - - subarray.finish_subarray()?.layout(self.layout) - } - - /// Prepares a read query to read the fields written by this operation - /// restricted to the subarray represented by this write. 
- pub fn attach_read<'data, B>( - &'data self, - b: B, - ) -> TileDBResult< - CallbackVarArgReadBuilder< - 'data, - MapAdapter, - B, - >, - > - where - B: ReadQueryBuilder<'data>, - { - let mut subarray = b.start_subarray()?; - - for i in 0..self.subarray.len() { - subarray = subarray.add_range(i, self.subarray[i].clone())?; - } - - let b: B = subarray.finish_subarray()?.layout(self.layout)?; - - Ok(self.data.attach_read(b)?.map(CellsConstructor::new())) - } -} - -#[derive(Clone, Debug)] -pub struct DenseWriteParameters { - pub schema: Option>, - pub layout: Option, - pub memory_limit: usize, -} - -impl DenseWriteParameters { - pub fn memory_limit_default() -> usize { - **crate::strategy::config::TILEDB_STRATEGY_DENSE_WRITE_PARAMETERS_MEMORY_LIMIT - } -} - -impl Default for DenseWriteParameters { - fn default() -> Self { - DenseWriteParameters { - schema: Default::default(), - layout: Default::default(), - memory_limit: Self::memory_limit_default(), - } - } -} - -pub struct DenseWriteValueTree { - layout: CellOrder, - field_order: Vec, - bounding_subarray: Vec>, - subarray: Vec>, - cells: StructuredCells, - prev_shrink: Option, -} - -impl DenseWriteValueTree { - pub fn new( - layout: CellOrder, - bounding_subarray: Vec, - subarray: Vec>, - cells: Cells, - ) -> Self { - let field_order = - cells.fields().keys().cloned().collect::>(); - - let cells = { - let dimension_len = bounding_subarray - .iter() - .map(|r| { - usize::try_from(r.num_cells().unwrap()) - .expect("Too many cells to fit in memory") - }) - .collect::>(); - StructuredCells::new(dimension_len, cells) - }; - - let bounding_subarray = bounding_subarray - .into_iter() - .map(|range| { - let r = RangeInclusive::::try_from(range).unwrap(); - assert!(r.start() <= r.end()); - r - }) - .collect::>>(); - - DenseWriteValueTree { - layout, - field_order, - bounding_subarray, - subarray, - cells, - prev_shrink: None, - } - } - - fn subarray_current(&self) -> Vec { - self.subarray - .iter() - .map(|tree| 
tree.current()) - .collect::>() - } - - fn cells_for_subarray( - &self, - subarray: &[SingleValueRange], - ) -> StructuredCells { - let slices = self - .bounding_subarray - .iter() - .zip(subarray.iter()) - .map(|(complete, current)| { - let current = - RangeInclusive::::try_from(current.clone()).unwrap(); - - assert!(current.start() <= current.end()); - - assert!( - complete.start() <= current.start(), - "complete = {:?}, current = {:?}", - complete, - current - ); - assert!( - current.end() <= complete.end(), - "complete = {:?}, current = {:?}", - complete, - current - ); - - let start = current.start() - complete.start(); - let end = current.end() - complete.start() + 1; - let ustart = usize::try_from(start) - .expect("Current range is narrower than bounding range"); - let uend = usize::try_from(end) - .expect("Current range is narrower than bounding range"); - ustart..uend - }) - .collect::>>(); - - self.cells.slice(slices) - } -} - -impl Debug for DenseWriteValueTree { - fn fmt(&self, f: &mut Formatter) -> FmtResult { - let json = json!({ - "layout": self.layout, - "field_order": self.field_order, - "bounding_subarray": self.bounding_subarray, - "subarray": self.subarray_current(), - "prev_shrink": self.prev_shrink - }); - write!(f, "{}", json) - } -} - -impl ValueTree for DenseWriteValueTree { - type Value = DenseWriteInput; - - fn current(&self) -> Self::Value { - let subarray = self.subarray_current(); - let cells = self.cells_for_subarray(&subarray); - - DenseWriteInput { - layout: self.layout, - data: cells.into_inner(), - subarray, - } - } - - fn simplify(&mut self) -> bool { - // try shrinking each dimension in round-robin order, - // beginning with the dimension after whichever we - // previously shrunk - let start = self.prev_shrink.map(|d| d + 1).unwrap_or(0); - - for i in 0..self.subarray.len() { - let idx = (start + i) % self.subarray.len(); - if self.subarray[idx].simplify() { - self.prev_shrink = Some(idx); - return true; - } - } - - 
self.prev_shrink = None; - false - } - - fn complicate(&mut self) -> bool { - // complicate whichever dimension we previously simplified - if let Some(d) = self.prev_shrink { - if self.subarray[d].complicate() { - // we may be able to complicate again, keep prev_shrink - true - } else { - self.prev_shrink = None; - false - } - } else { - false - } - } -} - -#[derive(Debug)] -pub struct DenseWriteStrategy { - schema: Rc, - layout: CellOrder, - params: DenseWriteParameters, -} - -impl DenseWriteStrategy { - pub fn new( - schema: Rc, - layout: CellOrder, - params: DenseWriteParameters, - ) -> Self { - DenseWriteStrategy { - schema, - layout, - params, - } - } -} - -impl Strategy for DenseWriteStrategy { - type Tree = DenseWriteValueTree; - type Value = DenseWriteInput; - - fn new_tree(&self, runner: &mut TestRunner) -> NewTree { - /* - * For simplicity, we will bound the memory used at each dimension - * rather than keeping a moving product of the accumulated memory - */ - let memory_limit = - { self.params.memory_limit / self.schema.domain.dimension.len() }; - - if matches!(self.layout, CellOrder::Global) { - // necessary to align to tile boundaries - unimplemented!() - } - - let est_cell_size: usize = self - .schema - .fields() - .map(|field| { - match field.cell_val_num().unwrap_or(CellValNum::single()) { - CellValNum::Fixed(nz) => { - /* exact */ - nz.get() as usize * field.datatype().size() as usize - } - CellValNum::Var => { - /* estimate */ - let params = - ::default(); - let est_nvalues = (params.value_min_var_size - + params.value_max_var_size) - / 2; - est_nvalues * field.datatype().size() as usize - } - } - }) - .sum(); - - let cell_limit: usize = memory_limit / est_cell_size; - - /* choose maximal subarray for the write, we will shrink within this window */ - let strat_subarray_bounds = self - .schema - .domain - .dimension - .iter() - .map(|d| { - d.subarray_strategy(Some(cell_limit)).expect("Dense dimension subarray not found") - .prop_map(|r| { - let 
Range::Single(s) = r else { - unreachable!("Dense dimension subarray is not `Range::Single`: {:?}", r) - }; - s - }).boxed() - }) - .collect::>>(); - - let bounding_subarray = strat_subarray_bounds - .into_iter() - .map(|strat| strat.new_tree(runner).unwrap().current()) - .collect::>(); - - /* prepare tree for each subarray dimension */ - let strat_subarray = bounding_subarray - .iter() - .cloned() - .map(|dim| { - single_value_range_go!( - dim, - _DT: Integral, - start, - end, - { - (start..=end) - .prop_flat_map(move |lower| { - (Just(lower), lower..=end).prop_map( - move |(lower, upper)| { - SingleValueRange::from(&[lower, upper]) - }, - ) - }) - .boxed() - }, - unreachable!() - ) - }) - .collect::>>(); - - let mut subarray: Vec> = vec![]; - for range in strat_subarray { - subarray.push(range.new_tree(runner).unwrap()); - } - - let cells = { - let ncells = bounding_subarray - .iter() - .map(|range| { - usize::try_from(range.num_cells().unwrap()) - .expect("Too many cells to fit in memory") - }) - .product(); - assert!(ncells > 0); - let params = CellsParameters { - schema: Some(CellsStrategySchema::WriteSchema(Rc::clone( - &self.schema, - ))), - min_records: ncells, - max_records: ncells, - ..Default::default() - }; - any_with::(params).new_tree(runner)?.current() - }; - - Ok(DenseWriteValueTree::new( - self.layout, - bounding_subarray, - subarray, - cells, - )) - } -} - -impl Arbitrary for DenseWriteInput { - type Parameters = DenseWriteParameters; - type Strategy = BoxedStrategy; - - fn arbitrary_with(args: Self::Parameters) -> Self::Strategy { - let mut args = args; - let strat_schema = match args.schema.take() { - None => { - let schema_req = - query_write_schema_requirements(Some(ArrayType::Dense)); - any_with::(Rc::new(schema_req)) - .prop_map(Rc::new) - .boxed() - } - Some(schema) => Just(schema).boxed(), - }; - let strat_layout = match args.layout.take() { - None => prop_oneof![ - Just(CellOrder::RowMajor), - Just(CellOrder::ColumnMajor), - /* TODO: 
CellOrder::Global is possible but has more constraints */ - ].boxed(), - Some(layout) => Just(layout).boxed() - }; - - (strat_schema, strat_layout) - .prop_flat_map(move |(schema, layout)| { - DenseWriteStrategy::new(schema, layout, args.clone()) - }) - .boxed() - } -} - -pub type SparseWriteParameters = DenseWriteParameters; // TODO: determine if this should be different - -#[derive(Clone, Debug)] -pub struct SparseWriteInput { - pub dimensions: Vec<(String, CellValNum)>, - pub data: Cells, -} - -impl SparseWriteInput { - pub fn from_schema_and_data(schema: &SchemaData, data: Cells) -> Self { - let dimensions = schema - .domain - .dimension - .iter() - .map(|d| (d.name.clone(), d.cell_val_num())) - .collect::>(); - SparseWriteInput { dimensions, data } - } - - /// Returns the minimum bounding rectangle containing all - /// the coordinates of this write operation. - pub fn domain(&self) -> Option { - self.dimensions - .iter() - .map(|(dim, cell_val_num)| { - let dim_cells = self.data.fields().get(dim).unwrap(); - Some(typed_field_data_go!( - dim_cells, - _DT, - ref dim_cells, - { - let min = - *dim_cells.iter().min_by(|l, r| l.bits_cmp(r))?; - let max = - *dim_cells.iter().max_by(|l, r| l.bits_cmp(r))?; - Range::from(&[min, max]) - }, - { - let min = dim_cells - .iter() - .min_by(|l, r| l.bits_cmp(r))? - .clone() - .into_boxed_slice(); - let max = dim_cells - .iter() - .max_by(|l, r| l.bits_cmp(r))? - .clone() - .into_boxed_slice(); - match cell_val_num { - CellValNum::Fixed(_) => { - Range::try_from((*cell_val_num, min, max)) - .unwrap() - } - CellValNum::Var => Range::from((min, max)), - } - } - )) - }) - .collect::>() - } - - /// Prepares a write query to insert data from this write operation. - pub fn attach_write<'data>( - &'data self, - b: WriteBuilder<'data>, - ) -> TileDBResult> { - self.data.attach_write(b) - } - - /// Prepares a read query to read the fields written by this operation. 
- pub fn attach_read<'data, B>( - &'data self, - b: B, - ) -> TileDBResult< - CallbackVarArgReadBuilder< - 'data, - MapAdapter, - B, - >, - > - where - B: ReadQueryBuilder<'data>, - { - Ok(self.data.attach_read(b)?.map(CellsConstructor::new())) - } - - /// Sort the data cells using the dimensions as sort keys, in order. - pub fn sort_cells(&mut self) { - let keys = self - .dimensions - .iter() - .map(|(k, _)| k.clone()) - .collect::>(); - self.data.sort(&keys) - } -} - -impl Arbitrary for SparseWriteInput { - type Parameters = SparseWriteParameters; - type Strategy = BoxedStrategy; - - fn arbitrary_with(params: Self::Parameters) -> Self::Strategy { - if let Some(schema) = params.schema.as_ref() { - let schema = Rc::clone(schema); - let cells_params = CellsParameters { - schema: Some(CellsStrategySchema::WriteSchema(Rc::clone( - &schema, - ))), - ..Default::default() - }; - any_with::(cells_params) - .prop_map(move |data| { - let dimensions = schema - .domain - .dimension - .iter() - .map(|d| (d.name.clone(), d.cell_val_num())) - .collect::>(); - SparseWriteInput { dimensions, data } - }) - .boxed() - } else { - any::() - .prop_flat_map(|data| { - (0..data.fields().len(), Just(data)).prop_map( - |(ndim, data)| SparseWriteInput { - dimensions: data - .fields() - .iter() - .take(ndim) - .map(|(fname, fdata)| { - ( - fname.clone(), - if fdata.is_cell_single() { - CellValNum::single() - } else { - CellValNum::Var - }, - ) - }) - .collect::>(), - data, - }, - ) - }) - .boxed() - } - } -} - -#[derive(Debug)] -pub struct DenseWriteSequence { - writes: Vec, -} - -impl DenseWriteSequence { - pub fn iter_mut(&mut self) -> impl Iterator { - self.writes.iter_mut() - } -} - -impl Deref for DenseWriteSequence { - type Target = Vec; - fn deref(&self) -> &Self::Target { - &self.writes - } -} - -impl Arbitrary for DenseWriteSequence { - type Parameters = DenseWriteSequenceParameters; - type Strategy = BoxedStrategy; - - fn arbitrary_with(params: Self::Parameters) -> Self::Strategy { 
- fn prop_write_sequence( - schema: Rc, - seq_params: DenseWriteSequenceParameters, - ) -> BoxedStrategy { - let write_params = DenseWriteParameters { - schema: Some(schema), - ..seq_params.write.as_ref().clone() - }; - proptest::collection::vec( - any_with::(write_params), - seq_params.min_writes..=seq_params.max_writes, - ) - .prop_map(|writes| DenseWriteSequence { writes }) - .boxed() - } - - if let Some(schema) = params.write.schema.as_ref() { - prop_write_sequence(Rc::clone(schema), params) - } else { - any::() - .prop_flat_map(move |schema| { - prop_write_sequence(Rc::new(schema), params.clone()) - }) - .boxed() - } - } -} - -impl From for DenseWriteSequence -where - T: Into>, -{ - fn from(value: T) -> Self { - DenseWriteSequence { - writes: value.into(), - } - } -} - -impl IntoIterator for DenseWriteSequence { - type Item = DenseWriteInput; - type IntoIter = as IntoIterator>::IntoIter; - - fn into_iter(self) -> Self::IntoIter { - self.writes.into_iter() - } -} - -impl FromIterator for DenseWriteSequence { - fn from_iter(iter: T) -> Self - where - T: IntoIterator, - { - DenseWriteSequence { - writes: iter.into_iter().collect::>(), - } - } -} - -#[derive(Debug)] -pub struct SparseWriteSequence { - writes: Vec, -} - -impl SparseWriteSequence { - pub fn iter_mut(&mut self) -> impl Iterator { - self.writes.iter_mut() - } -} - -impl Deref for SparseWriteSequence { - type Target = Vec; - fn deref(&self) -> &Self::Target { - &self.writes - } -} - -impl Arbitrary for SparseWriteSequence { - type Parameters = SparseWriteSequenceParameters; - type Strategy = BoxedStrategy; - - fn arbitrary_with(params: Self::Parameters) -> Self::Strategy { - pub fn prop_write_sequence( - schema: Rc, - seq_params: SparseWriteSequenceParameters, - ) -> impl Strategy { - let write_params = SparseWriteParameters { - schema: Some(schema), - ..seq_params.write.as_ref().clone() - }; - proptest::collection::vec( - any_with::(write_params), - seq_params.min_writes..=seq_params.max_writes, - ) - 
.prop_map(|writes| SparseWriteSequence { writes }) - } - - if let Some(schema) = params.write.schema.as_ref() { - prop_write_sequence(Rc::clone(schema), params).boxed() - } else { - any::() - .prop_flat_map(move |schema| { - prop_write_sequence(Rc::new(schema), params.clone()) - }) - .boxed() - } - } -} - -impl From for SparseWriteSequence -where - T: Into>, -{ - fn from(value: T) -> Self { - SparseWriteSequence { - writes: value.into(), - } - } -} - -impl IntoIterator for SparseWriteSequence { - type Item = SparseWriteInput; - type IntoIter = as IntoIterator>::IntoIter; - - fn into_iter(self) -> Self::IntoIter { - self.writes.into_iter() - } -} - -impl FromIterator for SparseWriteSequence { - fn from_iter(iter: T) -> Self - where - T: IntoIterator, - { - SparseWriteSequence { - writes: iter.into_iter().collect::>(), - } - } -} - -#[derive(Clone, Debug)] -pub enum WriteInput { - Dense(DenseWriteInput), - Sparse(SparseWriteInput), -} - -impl WriteInput { - /// Returns a reference to the cells of input of this write operation. - pub fn cells(&self) -> &Cells { - match self { - Self::Dense(ref dense) => &dense.data, - Self::Sparse(ref sparse) => &sparse.data, - } - } - - /// Returns a mutable reference to the cells of input of this write operation. - pub fn cells_mut(&mut self) -> &mut Cells { - match self { - Self::Dense(ref mut dense) => &mut dense.data, - Self::Sparse(ref mut sparse) => &mut sparse.data, - } - } - - /// Returns the minimum bounding rectangle containing - /// the coordinates of this write operation. - pub fn domain(&self) -> Option { - match self { - Self::Dense(ref dense) => Some( - dense - .subarray - .clone() - .into_iter() - .map(Range::from) - .collect::(), - ), - Self::Sparse(ref sparse) => sparse.domain(), - } - } - - /// Returns the subarray for this write operation, - /// if it is a dense write. Returns `None` otherwise. 
- pub fn subarray(&self) -> Option { - if let Self::Dense(_) = self { - self.domain() - } else { - None - } - } - - /// Consumes `self` and returns the underlying test data. - pub fn unwrap_cells(self) -> Cells { - match self { - Self::Dense(dense) => dense.data, - Self::Sparse(sparse) => sparse.data, - } - } - - /// Prepares a write queryto insert data from this write operation. - pub fn attach_write<'data>( - &'data self, - b: WriteBuilder<'data>, - ) -> TileDBResult> { - match self { - Self::Dense(ref d) => d.attach_write(b), - Self::Sparse(ref s) => s.attach_write(b), - } - } - - /// Prepares a read query to read the fields written by this operation. - pub fn attach_read<'data, B>( - &'data self, - b: B, - ) -> TileDBResult< - CallbackVarArgReadBuilder< - 'data, - MapAdapter, - B, - >, - > - where - B: ReadQueryBuilder<'data>, - { - match self { - Self::Dense(ref d) => d.attach_read(b), - Self::Sparse(ref s) => s.attach_read(b), - } - } -} - -pub enum WriteInputRef<'a> { - Dense(&'a DenseWriteInput), - Sparse(&'a SparseWriteInput), -} - -impl WriteInputRef<'_> { - /// Returns a reference to the cells of input of this write operation. - pub fn cells(&self) -> &Cells { - match self { - Self::Dense(dense) => &dense.data, - Self::Sparse(sparse) => &sparse.data, - } - } - - pub fn cloned(&self) -> WriteInput { - match self { - Self::Dense(dense) => WriteInput::Dense((*dense).clone()), - Self::Sparse(sparse) => WriteInput::Sparse((*sparse).clone()), - } - } - - /// Returns the minimum bounding rectangle containing - /// the coordinates of this write operation. - pub fn domain(&self) -> Option { - match self { - Self::Dense(dense) => Some( - dense - .subarray - .clone() - .into_iter() - .map(Range::from) - .collect::(), - ), - Self::Sparse(sparse) => sparse.domain(), - } - } - - /// Returns the subarray for this write operation, - /// if it is a dense write. Returns `None` otherwise. 
- pub fn subarray(&self) -> Option { - if let Self::Dense(_) = self { - self.domain() - } else { - None - } - } - - /// Prepares a write queryto insert data from this write operation. - pub fn attach_write<'data>( - &'data self, - b: WriteBuilder<'data>, - ) -> TileDBResult> { - match self { - Self::Dense(d) => d.attach_write(b), - Self::Sparse(s) => s.attach_write(b), - } - } - - /// Prepares a read query to read the fields written by this operation. - pub fn attach_read<'data, B>( - &'data self, - b: B, - ) -> TileDBResult< - CallbackVarArgReadBuilder< - 'data, - MapAdapter, - B, - >, - > - where - B: ReadQueryBuilder<'data>, - { - match self { - Self::Dense(d) => d.attach_read(b), - Self::Sparse(s) => s.attach_read(b), - } - } -} - -#[derive(Debug)] -pub enum WriteParameters { - Dense(DenseWriteParameters), - Sparse(SparseWriteParameters), -} - -impl WriteParameters { - pub fn default_for(schema: Rc) -> Self { - match schema.array_type { - ArrayType::Dense => Self::Dense(DenseWriteParameters { - schema: Some(schema), - ..Default::default() - }), - ArrayType::Sparse => Self::Sparse(SparseWriteParameters { - schema: Some(schema), - ..Default::default() - }), - } - } -} - -impl Default for WriteParameters { - fn default() -> Self { - Self::Dense(DenseWriteParameters::default()) - } -} - -impl Arbitrary for WriteInput { - type Parameters = WriteParameters; - type Strategy = BoxedStrategy; - - fn arbitrary_with(params: Self::Parameters) -> Self::Strategy { - match params { - WriteParameters::Dense(d) => any_with::(d) - .prop_map(WriteInput::Dense) - .boxed(), - WriteParameters::Sparse(s) => any_with::(s) - .prop_map(WriteInput::Sparse) - .boxed(), - } - } -} - -#[derive(Clone, Debug)] -pub struct WriteSequenceParametersImpl { - pub write: Rc, - pub min_writes: usize, - pub max_writes: usize, -} - -pub type DenseWriteSequenceParameters = - WriteSequenceParametersImpl; -pub type SparseWriteSequenceParameters = - WriteSequenceParametersImpl; - -impl 
WriteSequenceParametersImpl { - pub fn min_writes_default() -> usize { - **crate::strategy::config::TILEDB_STRATEGY_WRITE_SEQUENCE_PARAMETERS_MIN_WRITES - } - - pub fn max_writes_default() -> usize { - **crate::strategy::config::TILEDB_STRATEGY_WRITE_SEQUENCE_PARAMETERS_MAX_WRITES - } -} - -impl Default for WriteSequenceParametersImpl -where - W: Default, -{ - fn default() -> Self { - WriteSequenceParametersImpl { - write: Rc::new(Default::default()), - min_writes: Self::min_writes_default(), - max_writes: Self::max_writes_default(), - } - } -} - -#[derive(Debug)] -pub enum WriteSequenceParameters { - Dense(DenseWriteSequenceParameters), - Sparse(SparseWriteSequenceParameters), -} - -impl WriteSequenceParameters { - pub fn default_for(schema: Rc) -> Self { - match schema.array_type { - ArrayType::Dense => Self::Dense(DenseWriteSequenceParameters { - write: Rc::new(DenseWriteParameters { - schema: Some(schema), - ..Default::default() - }), - min_writes: DenseWriteSequenceParameters::min_writes_default(), - max_writes: DenseWriteSequenceParameters::max_writes_default(), - }), - ArrayType::Sparse => Self::Sparse(SparseWriteSequenceParameters { - write: Rc::new(SparseWriteParameters { - schema: Some(schema), - ..Default::default() - }), - min_writes: SparseWriteSequenceParameters::min_writes_default(), - max_writes: SparseWriteSequenceParameters::max_writes_default(), - }), - } - } -} - -impl Default for WriteSequenceParameters { - fn default() -> Self { - Self::Dense(Default::default()) - } -} - -#[derive(Debug)] -pub enum WriteSequence { - Dense(DenseWriteSequence), - Sparse(SparseWriteSequence), -} - -impl WriteSequence { - pub fn iter(&self) -> WriteSequenceRefIter { - self.into_iter() - } -} - -impl From for WriteSequence { - fn from(value: WriteInput) -> Self { - match value { - WriteInput::Dense(dense) => Self::Dense(DenseWriteSequence { - writes: vec![dense], - }), - WriteInput::Sparse(sparse) => Self::Sparse(SparseWriteSequence { - writes: vec![sparse], - }), 
- } - } -} - -impl IntoIterator for WriteSequence { - type Item = WriteInput; - type IntoIter = WriteSequenceIter; - - fn into_iter(self) -> Self::IntoIter { - match self { - Self::Dense(dense) => WriteSequenceIter::Dense(dense.into_iter()), - Self::Sparse(sparse) => { - WriteSequenceIter::Sparse(sparse.into_iter()) - } - } - } -} - -impl<'a> IntoIterator for &'a WriteSequence { - type Item = WriteInputRef<'a>; - type IntoIter = WriteSequenceRefIter<'a>; - - fn into_iter(self) -> Self::IntoIter { - match *self { - WriteSequence::Dense(ref dense) => { - WriteSequenceRefIter::Dense(dense.iter()) - } - WriteSequence::Sparse(ref sparse) => { - WriteSequenceRefIter::Sparse(sparse.iter()) - } - } - } -} - -impl Arbitrary for WriteSequence { - type Parameters = WriteSequenceParameters; - type Strategy = BoxedStrategy; - - fn arbitrary_with(params: Self::Parameters) -> Self::Strategy { - match params { - WriteSequenceParameters::Dense(d) => { - any_with::(d) - .prop_map(Self::Dense) - .boxed() - } - WriteSequenceParameters::Sparse(s) => { - any_with::(s) - .prop_map(Self::Sparse) - .boxed() - } - } - } -} - -pub enum WriteSequenceIter { - Dense(::IntoIter), - Sparse(::IntoIter), -} - -impl Iterator for WriteSequenceIter { - type Item = WriteInput; - - fn next(&mut self) -> Option { - match self { - Self::Dense(ref mut dense) => dense.next().map(WriteInput::Dense), - Self::Sparse(ref mut sparse) => { - sparse.next().map(WriteInput::Sparse) - } - } - } - - fn size_hint(&self) -> (usize, Option) { - match self { - Self::Dense(ref d) => d.size_hint(), - Self::Sparse(ref s) => s.size_hint(), - } - } -} - -pub enum WriteSequenceRefIter<'a> { - Dense(<&'a Vec as IntoIterator>::IntoIter), - Sparse(<&'a Vec as IntoIterator>::IntoIter), -} - -impl<'a> Iterator for WriteSequenceRefIter<'a> { - type Item = WriteInputRef<'a>; - - fn next(&mut self) -> Option { - match self { - Self::Dense(ref mut dense) => { - dense.next().map(WriteInputRef::Dense) - } - Self::Sparse(ref mut sparse) => 
{ - sparse.next().map(WriteInputRef::Sparse) - } - } - } - - fn size_hint(&self) -> (usize, Option) { - match self { - Self::Dense(ref d) => d.size_hint(), - Self::Sparse(ref s) => s.size_hint(), - } - } -} - -#[cfg(test)] -mod tests { - use tiledb_test_utils::{self, TestArrayUri}; - - use super::*; - use crate::array::{Array, ArrayOpener, Mode}; - use crate::error::Error; - use crate::query::{ - Query, QueryBuilder, ReadBuilder, ReadQuery, WriteBuilder, - }; - use crate::{Context, Factory}; - - struct DenseCellsAccumulator { - // TODO: implement accepting more than one write for dense write sequence - write: Option, - } - - impl DenseCellsAccumulator { - pub fn new(_: &SchemaData) -> Self { - DenseCellsAccumulator { write: None } - } - - pub fn cells(&self) -> &Cells { - // will not be called until first cells are written - &self.write.as_ref().unwrap().data - } - - pub fn accumulate(&mut self, write: DenseWriteInput) { - if self.write.is_some() { - unimplemented!() - } - self.write = Some(write) - } - - pub fn attach_read<'data, B>( - &'data self, - b: B, - ) -> TileDBResult< - CallbackVarArgReadBuilder< - 'data, - MapAdapter, - B, - >, - > - where - B: ReadQueryBuilder<'data>, - { - // TODO: this is not correct as we accumulate multiple writes - self.write.as_ref().unwrap().attach_read(b) - } - } - - struct SparseCellsAccumulator { - cells: Option, - dedup_keys: Option>, - } - - impl SparseCellsAccumulator { - pub fn new(schema: &SchemaData) -> Self { - let dedup_keys = if schema.allow_duplicates.unwrap_or(false) { - None - } else { - Some( - schema - .domain - .dimension - .iter() - .map(|d| d.name.clone()) - .collect::>(), - ) - }; - SparseCellsAccumulator { - cells: None, - dedup_keys, - } - } - - pub fn cells(&self) -> &Cells { - // will not be called until first cells arrive - self.cells.as_ref().unwrap() - } - - /// Update state representing what we expect to see in the array. 
- /// For a sparse array this means adding this write's coordinates, - /// overwriting the old coordinates if they overlap. - pub fn accumulate(&mut self, mut write: SparseWriteInput) { - if let Some(cells) = self.cells.take() { - write.data.extend(cells); - if let Some(dedup_keys) = self.dedup_keys.as_ref() { - self.cells = Some(write.data.dedup(dedup_keys)); - } else { - self.cells = Some(write.data); - } - } else { - self.cells = Some(write.data); - } - } - - pub fn attach_read<'data, B>( - &'data self, - b: B, - ) -> TileDBResult< - CallbackVarArgReadBuilder< - 'data, - MapAdapter, - B, - >, - > - where - B: ReadQueryBuilder<'data>, - { - Ok(self.cells().attach_read(b)?.map(CellsConstructor::new())) - } - } - - enum CellsAccumulator { - Dense(DenseCellsAccumulator), - Sparse(SparseCellsAccumulator), - } - - impl CellsAccumulator { - pub fn new(schema: &SchemaData) -> Self { - match schema.array_type { - ArrayType::Dense => { - Self::Dense(DenseCellsAccumulator::new(schema)) - } - ArrayType::Sparse => { - Self::Sparse(SparseCellsAccumulator::new(schema)) - } - } - } - - pub fn cells(&self) -> &Cells { - match self { - Self::Dense(ref d) => d.cells(), - Self::Sparse(ref s) => s.cells(), - } - } - - pub fn accumulate(&mut self, write: WriteInput) { - match write { - WriteInput::Sparse(w) => { - let Self::Sparse(ref mut sparse) = self else { - unreachable!() - }; - sparse.accumulate(w) - } - WriteInput::Dense(w) => { - let Self::Dense(ref mut dense) = self else { - unreachable!() - }; - dense.accumulate(w) - } - } - } - - pub fn attach_read<'data, B>( - &'data self, - b: B, - ) -> TileDBResult< - CallbackVarArgReadBuilder< - 'data, - MapAdapter, - B, - >, - > - where - B: ReadQueryBuilder<'data>, - { - match self { - Self::Dense(ref d) => d.attach_read(b), - Self::Sparse(ref s) => s.attach_read(b), - } - } - } - - fn do_write_readback( - ctx: &Context, - schema_spec: Rc, - write_sequence: WriteSequence, - ) -> TileDBResult<()> { - let test_uri = 
tiledb_test_utils::get_uri_generator() - .map_err(|e| Error::Other(e.to_string()))?; - let uri = test_uri - .with_path("array") - .map_err(|e| Error::Other(e.to_string()))?; - - let schema_in = schema_spec - .create(ctx) - .expect("Error constructing arbitrary schema"); - Array::create(ctx, &uri, schema_in).expect("Error creating array"); - - let mut accumulated_domain: Option = None; - let mut accumulated_write = CellsAccumulator::new(&schema_spec); - - /* - * Results do not come back in a defined order, so we must sort and - * compare. Writes currently have to write all fields. - */ - let sort_keys = match write_sequence { - WriteSequence::Dense(_) => schema_spec - .attributes - .iter() - .map(|f| f.name.clone()) - .collect::>(), - WriteSequence::Sparse(_) => schema_spec - .fields() - .map(|f| f.name().to_owned()) - .collect::>(), - }; - - for write in write_sequence { - /* write data and preserve ranges for sanity check */ - let write_ranges = { - let array = Array::open(ctx, &uri, Mode::Write) - .expect("Error opening array"); - - let write_query = write - .attach_write( - WriteBuilder::new(array) - .expect("Error building write query"), - ) - .expect("Error building write query") - .build(); - write_query.submit().expect("Error running write query"); - - let write_ranges = if let Some(ranges) = write.subarray() { - let generic_ranges = ranges - .iter() - .cloned() - .map(|r| vec![r]) - .collect::>>(); - assert_eq!( - generic_ranges, - write_query.subarray().unwrap().ranges().unwrap() - ); - Some(generic_ranges) - } else { - None - }; - - let _ = write_query - .finalize() - .expect("Error finalizing write query"); - - write_ranges - }; - - if write.cells().is_empty() { - // in this case, writing and finalizing does not create a new fragment - // TODO - continue; - } - - /* NB: results are not read back in a defined order, so we must sort and compare */ - - let mut array = ArrayOpener::new(ctx, &uri, Mode::Read) - .unwrap() - .open() - .unwrap(); - - /* - * 
First check fragment - its domain should match what we just wrote, and we need the - * timestamp so we can read back only this fragment - */ - let [timestamp_min, timestamp_max] = { - let fi = array.fragment_info().unwrap(); - let nf = fi.num_fragments().unwrap(); - assert!(nf > 0); - - let this_fragment = fi.get_fragment(nf - 1).unwrap(); - - if let Some(write_domain) = write.domain() { - let nonempty_domain = - this_fragment.non_empty_domain().unwrap().untyped(); - assert_eq!(write_domain, nonempty_domain); - } else { - // most recent fragment should be empty, - // what does that look like if no data was written? - } - - this_fragment.timestamp_range().unwrap() - }; - - let safety_write_start = std::time::Instant::now(); - - /* - * Then re-open the array to read back what we just wrote - * into the most recent fragment only - */ - { - array = array - .reopen() - .start_timestamp(timestamp_min) - .unwrap() - .end_timestamp(timestamp_max) - .unwrap() - .open() - .unwrap(); - - let mut read = write - .attach_read(ReadBuilder::new(array).unwrap()) - .unwrap() - .build(); - - if let Some(write_ranges) = write_ranges { - let read_ranges = - read.subarray().unwrap().ranges().unwrap(); - assert_eq!(write_ranges, read_ranges); - } - - let (mut cells, _) = read.execute().unwrap(); - - /* `cells` should match the write */ - { - let write_sorted = write.cells().sorted(&sort_keys); - cells.sort(&sort_keys); - assert_eq!(write_sorted, cells); - } - - array = read.finalize().unwrap(); - } - - /* finally, check that everything written up until now is correct */ - array = array.reopen().start_timestamp(0).unwrap().open().unwrap(); - - /* check array non-empty domain */ - if let Some(accumulated_domain) = accumulated_domain.as_mut() { - let Some(write_domain) = write.domain() else { - unreachable!() - }; - *accumulated_domain = accumulated_domain.union(&write_domain); - } else { - accumulated_domain = write.domain(); - } - { - let Some(acc) = accumulated_domain.as_ref() else { - 
unreachable!() - }; - let nonempty = - array.nonempty_domain().unwrap().unwrap().untyped(); - assert_eq!(*acc, nonempty); - } - - /* update accumulated expected array data */ - accumulated_write.accumulate(write); - { - let acc = accumulated_write.cells().sorted(&sort_keys); - - let cells = { - let mut read = accumulated_write - .attach_read(ReadBuilder::new(array).unwrap()) - .unwrap() - .build(); - - let (mut cells, _) = read.execute().unwrap(); - cells.sort(&sort_keys); - cells - }; - - assert_eq!(acc, cells); - } - - // safety valve to ensure we don't write two fragments in the same millisecond - if safety_write_start.elapsed() - < std::time::Duration::from_millis(1) - { - std::thread::sleep(std::time::Duration::from_millis(1)); - } - } - - Ok(()) - } - - /// Test that a single write can be read back correctly - #[test] - fn write_once_readback() -> TileDBResult<()> { - let ctx = Context::new().expect("Error creating context"); - - let schema_req = query_write_schema_requirements(None); - - let strategy = any_with::(Rc::new(schema_req)) - .prop_flat_map(|schema| { - let schema = Rc::new(schema); - ( - Just(Rc::clone(&schema)), - any_with::(WriteParameters::default_for( - schema, - )) - .prop_map(WriteSequence::from), - ) - }); - - proptest!(|((schema_spec, write_sequence) in strategy)| { - do_write_readback(&ctx, schema_spec, write_sequence)?; - }); - - Ok(()) - } - - /// Test that each write in the sequence can be read back correctly at the right timestamp - #[test] - fn write_sequence_readback() -> TileDBResult<()> { - let ctx = Context::new().expect("Error creating context"); - - let schema_req = - query_write_schema_requirements(Some(ArrayType::Sparse)); - - let strategy = any_with::(Rc::new(schema_req)) - .prop_flat_map(|schema| { - let schema = Rc::new(schema); - ( - Just(Rc::clone(&schema)), - any_with::( - WriteSequenceParameters::default_for(Rc::clone( - &schema, - )), - ), - ) - }); - - proptest!(|((schema_spec, write_sequence) in strategy)| { - 
do_write_readback(&ctx, schema_spec, write_sequence)?; - }); - - Ok(()) - } -} diff --git a/tiledb/api/src/stats.rs b/tiledb/api/src/stats.rs index d1e02779..e729d686 100644 --- a/tiledb/api/src/stats.rs +++ b/tiledb/api/src/stats.rs @@ -1,7 +1,7 @@ use std::collections::HashMap; use std::ops::Deref; -use anyhow::anyhow; +#[cfg(feature = "serde")] use serde::{Deserialize, Serialize}; use crate::error::Error; @@ -31,7 +31,8 @@ impl Drop for RawStatsString { } } -#[derive(Clone, Debug, Deserialize, PartialEq, Serialize)] +#[derive(Clone, Debug, PartialEq)] +#[cfg_attr(feature = "serde", derive(Deserialize, Serialize))] pub struct Metrics { pub timers: HashMap, pub counters: HashMap, @@ -91,7 +92,10 @@ pub fn dump() -> TileDBResult> { Ok(Some(stats_dump_rust_str)) } +#[cfg(feature = "serde")] pub fn dump_json() -> TileDBResult>> { + use anyhow::anyhow; + if let Some(dump) = dump()? { let datas: Vec = serde_json::from_str::>( dump.as_str(), diff --git a/tiledb/api/src/strategy.rs b/tiledb/api/src/strategy.rs deleted file mode 100644 index 747146be..00000000 --- a/tiledb/api/src/strategy.rs +++ /dev/null @@ -1,126 +0,0 @@ -pub mod config { - use std::ops::Deref; - use std::str::FromStr; - use std::sync::LazyLock; - - fn try_parse_env(env: &str) -> Option - where - T: FromStr, - { - match std::env::var(env) { - Ok(value) => Some( - T::from_str(&value) - .unwrap_or_else(|_| panic!("Invalid value for {}", env)), - ), - Err(_) => None, - } - } - - /// The value of a strategy configuration parameter and its provenance. - pub enum Configuration { - Default(T), - Environmental(T), - } - - impl Configuration { - /// Converts to [Option], returning the wrapped value - /// if this is [Environmental] and [None] otherwise. 
- pub fn environmental(&self) -> Option - where - T: Copy, - { - match self { - Self::Default(_) => None, - Self::Environmental(value) => Some(*value), - } - } - } - - impl Deref for Configuration { - type Target = T; - - fn deref(&self) -> &Self::Target { - match self { - Self::Default(ref value) => value, - Self::Environmental(ref value) => value, - } - } - } - - macro_rules! config_param { - ($name:ident, $type:ty, $default:expr) => { - pub static $name: LazyLock> = - LazyLock::new(|| { - if let Some(value) = - try_parse_env::<$type>(stringify!($name)) - { - Configuration::Environmental(value) - } else { - Configuration::Default($default) - } - }); - }; - } - - // array/domain/strategy.rs - config_param!(TILEDB_STRATEGY_DOMAIN_PARAMETERS_DIMENSIONS_MIN, usize, 1); - config_param!(TILEDB_STRATEGY_DOMAIN_PARAMETERS_DIMENSIONS_MAX, usize, 8); - config_param!( - TILEDB_STRATEGY_DOMAIN_PARAMETERS_CELLS_PER_TILE_LIMIT, - usize, - 1024 * 32 - ); - - // array/schema/strategy.rs - config_param!(TILEDB_STRATEGY_SCHEMA_PARAMETERS_ATTRIBUTES_MIN, usize, 1); - config_param!(TILEDB_STRATEGY_SCHEMA_PARAMETERS_ATTRIBUTES_MAX, usize, 8); - config_param!( - TILEDB_STRATEGY_SCHEMA_PARAMETERS_SPARSE_TILE_CAPACITY_MIN, - u64, - 1 - ); - config_param!( - TILEDB_STRATEGY_SCHEMA_PARAMETERS_SPARSE_TILE_CAPACITY_MAX, - u64, - crate::array::domain::strategy::Requirements::cells_per_tile_limit_default() as u64 - ); - - // array/enumeration/strategy.rs - config_param!( - TILEDB_STRATEGY_ENUMERATION_PARAMETERS_NUM_VARIANTS_MIN, - usize, - 1 - ); - config_param!( - TILEDB_STRATEGY_ENUMERATION_PARAMETERS_NUM_VARIANTS_MAX, - usize, - 1024 - ); - - // query/strategy.rs - config_param!(TILEDB_STRATEGY_CELLS_PARAMETERS_NUM_RECORDS_MIN, usize, 0); - config_param!(TILEDB_STRATEGY_CELLS_PARAMETERS_NUM_RECORDS_MAX, usize, 16); - config_param!(TILEDB_STRATEGY_CELLS_PARAMETERS_CELL_VAR_SIZE_MIN, usize, 0); - config_param!( - TILEDB_STRATEGY_CELLS_PARAMETERS_CELL_VAR_SIZE_MAX, - usize, - 16 - ); - - // 
query/write/strategy.rs - config_param!( - TILEDB_STRATEGY_DENSE_WRITE_PARAMETERS_MEMORY_LIMIT, - usize, - 16 * 1024 // chosen arbitrarily; seems small - ); - config_param!( - TILEDB_STRATEGY_WRITE_SEQUENCE_PARAMETERS_MIN_WRITES, - usize, - 1 - ); - config_param!( - TILEDB_STRATEGY_WRITE_SEQUENCE_PARAMETERS_MAX_WRITES, - usize, - 8 - ); -} diff --git a/tiledb/api/src/tests/examples/mod.rs b/tiledb/api/src/tests/examples/mod.rs index c0af60d5..823a17d2 100644 --- a/tiledb/api/src/tests/examples/mod.rs +++ b/tiledb/api/src/tests/examples/mod.rs @@ -2,10 +2,11 @@ use std::rc::Rc; use proptest::prelude::*; use proptest::test_runner::TestRunner; -use tiledb_test_utils::TestArrayUri; +use tiledb_pod::array::schema::SchemaData; +use uri::TestArrayUri; -use crate::array::schema::SchemaData; use crate::error::Error; +use crate::query::ToWriteQuery; use crate::tests::prelude::*; use crate::tests::strategy::prelude::*; use crate::{Context, Factory, Result as TileDBResult}; @@ -22,7 +23,7 @@ pub struct TestArray { impl TestArray { pub fn new(name: &str, schema: Rc) -> TileDBResult { - let test_uri = tiledb_test_utils::get_uri_generator() + let test_uri = uri::get_uri_generator() .map_err(|e| Error::Other(e.to_string()))?; let uri = test_uri .with_path(name) diff --git a/tiledb/api/src/tests/examples/sparse_all.rs b/tiledb/api/src/tests/examples/sparse_all.rs index 3c85cdfe..91ee7a00 100644 --- a/tiledb/api/src/tests/examples/sparse_all.rs +++ b/tiledb/api/src/tests/examples/sparse_all.rs @@ -4,11 +4,14 @@ use std::rc::Rc; -use crate::array::{ - ArrayType, AttributeData, CellValNum, DimensionConstraints, DimensionData, - DomainData, SchemaData, -}; -use crate::{physical_type_go, Datatype}; +use tiledb_common::array::dimension::DimensionConstraints; +use tiledb_common::array::{ArrayType, CellValNum}; +use tiledb_common::datatype::Datatype; +use tiledb_common::physical_type_go; +use tiledb_pod::array::attribute::AttributeData; +use tiledb_pod::array::dimension::DimensionData; 
+use tiledb_pod::array::domain::DomainData; +use tiledb_pod::array::schema::SchemaData; pub type FnAcceptDimension = dyn Fn(&Parameters, Datatype) -> bool; pub type FnAcceptAttribute = diff --git a/tiledb/api/src/tests/mod.rs b/tiledb/api/src/tests/mod.rs index 5b5ee59d..19035419 100644 --- a/tiledb/api/src/tests/mod.rs +++ b/tiledb/api/src/tests/mod.rs @@ -1,19 +1,21 @@ pub mod examples; pub mod prelude { - pub use crate::array::attribute::{ - AttributeData, Builder as AttributeBuilder, + pub use tiledb_common::array::dimension::DimensionConstraints; + pub use tiledb_common::array::{ + ArrayType, CellOrder, CellValNum, Mode, TileOrder, }; - pub use crate::array::dimension::{ - Builder as DimensionBuilder, DimensionConstraints, DimensionData, - }; - pub use crate::array::domain::{Builder as DomainBuilder, DomainData}; - pub use crate::array::schema::{Builder as SchemaBuilder, SchemaData}; - pub use crate::array::{ - Array, ArrayType, Attribute, CellOrder, CellValNum, Dimension, Domain, - Mode, Schema, TileOrder, - }; - pub use crate::Datatype; + pub use tiledb_common::datatype::Datatype; + pub use tiledb_pod::array::attribute::AttributeData; + pub use tiledb_pod::array::dimension::DimensionData; + pub use tiledb_pod::array::domain::DomainData; + pub use tiledb_pod::array::schema::SchemaData; + + pub use crate::array::attribute::Builder as AttributeBuilder; + pub use crate::array::dimension::Builder as DimensionBuilder; + pub use crate::array::domain::Builder as DomainBuilder; + pub use crate::array::schema::Builder as SchemaBuilder; + pub use crate::array::{Array, Attribute, Dimension, Domain, Schema}; pub use crate::query::{ Query, QueryBuilder, QueryLayout, ReadBuilder, ReadQuery, WriteBuilder, @@ -23,15 +25,16 @@ pub mod prelude { pub use super::examples::TestArray; } +#[cfg(any(test, feature = "proptest-strategies"))] pub mod strategy { pub mod prelude { // NB: this is hardly exhaustive, feel free to add stuff, this is just what has been needed // so far - pub 
use crate::query::strategy::{Cells, FieldData}; - pub use crate::query::write::strategy::{ - DenseWriteInput, DenseWriteParameters, SparseWriteInput, - SparseWriteParameters, WriteInput, + pub use cells::write::strategy::{ + DenseWriteParameters, SparseWriteParameters, }; + pub use cells::write::{DenseWriteInput, SparseWriteInput, WriteInput}; + pub use cells::{Cells, FieldData}; } } diff --git a/tiledb/api/src/vfs.rs b/tiledb/api/src/vfs.rs index 960d1906..b3662f1b 100644 --- a/tiledb/api/src/vfs.rs +++ b/tiledb/api/src/vfs.rs @@ -1,42 +1,10 @@ use std::ops::Deref; -use serde::{Deserialize, Serialize}; - use crate::config::{Config, RawConfig}; use crate::context::{CApiInterface, Context, ContextBound}; use crate::Result as TileDBResult; -#[derive(Clone, Copy, Debug, Deserialize, Eq, PartialEq, Serialize)] -pub enum VFSMode { - Read, - Write, - Append, -} - -impl VFSMode { - pub(crate) fn capi_enum(&self) -> ffi::tiledb_vfs_mode_t { - match *self { - VFSMode::Read => ffi::tiledb_vfs_mode_t_TILEDB_VFS_READ, - VFSMode::Write => ffi::tiledb_vfs_mode_t_TILEDB_VFS_WRITE, - VFSMode::Append => ffi::tiledb_vfs_mode_t_TILEDB_VFS_APPEND, - } - } -} - -impl TryFrom for VFSMode { - type Error = crate::error::Error; - fn try_from(value: ffi::tiledb_vfs_mode_t) -> TileDBResult { - match value { - ffi::tiledb_vfs_mode_t_TILEDB_VFS_READ => Ok(VFSMode::Read), - ffi::tiledb_vfs_mode_t_TILEDB_VFS_WRITE => Ok(VFSMode::Write), - ffi::tiledb_vfs_mode_t_TILEDB_VFS_APPEND => Ok(VFSMode::Append), - _ => Err(Self::Error::LibTileDB(format!( - "Invalid VFS mode: {}", - value - ))), - } - } -} +pub use tiledb_common::vfs::VFSMode; pub enum VFSLsStatus { Continue, @@ -312,7 +280,7 @@ impl VFS { ctx, c_vfs, c_uri.as_ptr(), - mode.capi_enum(), + ffi::tiledb_vfs_mode_t::from(mode), &mut c_fh, ) })?; @@ -546,7 +514,7 @@ mod tests { // There is no cloud service backend for the VFS so we're not using the // URI generator facilities in these tests. 
use crate::error::Error; - use tiledb_test_utils::uri_generators::TestDirectory; + use uri::TestDirectory; #[test] fn vfs_alloc() -> TileDBResult<()> { diff --git a/tiledb/common/Cargo.toml b/tiledb/common/Cargo.toml new file mode 100644 index 00000000..1b604ea6 --- /dev/null +++ b/tiledb/common/Cargo.toml @@ -0,0 +1,27 @@ +[package] +name = "tiledb-common" +edition.workspace = true +rust-version.workspace = true +version.workspace = true + +[dependencies] +anyhow = { workspace = true } +arrow-schema = { workspace = true, optional = true } +paste = { workspace = true } +proptest = { workspace = true, optional = true } +serde = { workspace = true, optional = true } +serde_json = { workspace = true, optional = true } +thiserror = { workspace = true } +tiledb-proc-macro = { workspace = true, optional = true } +tiledb-sys = { workspace = true } +tiledb-utils = { workspace = true, optional = true } + +[dev-dependencies] +proptest = { workspace = true } + +[features] +default = [] +arrow = ["dep:arrow-schema"] +option-subset = ["dep:tiledb-proc-macro", "dep:tiledb-utils"] +proptest-strategies = ["dep:proptest"] +serde = ["dep:serde", "dep:serde_json"] diff --git a/tiledb/common/src/array/attribute.rs b/tiledb/common/src/array/attribute.rs new file mode 100644 index 00000000..8eb7e89f --- /dev/null +++ b/tiledb/common/src/array/attribute.rs @@ -0,0 +1,224 @@ +use std::num::NonZeroU32; + +use anyhow::anyhow; +use thiserror::Error; + +use crate::array::CellValNum; +use crate::datatype::PhysicalType; + +/// Trait for data which can be used as a fill value for an attribute. +pub trait IntoFillValue { + type PhysicalType: PhysicalType; + + /// Get a reference to the raw fill value data. + /// The returned slice will be copied into the tiledb core. + fn to_raw(&self) -> &[Self::PhysicalType]; +} + +/// Trait for data which can be constructed from an attribute's raw fill value. 
+pub trait FromFillValue<'a>: IntoFillValue + Sized { + /// Construct a value of this type from a raw fill value. + fn from_raw( + raw: &'a [Self::PhysicalType], + ) -> Result; +} + +#[derive(Debug, Error)] +pub enum FromFillValueError { + #[error("Unexpected cell structure: expected {0}, found {1}")] + UnexpectedCellStructure(CellValNum, CellValNum), + #[error("Error constructing object: {0}")] + Construction(anyhow::Error), +} + +impl IntoFillValue for T +where + T: PhysicalType, +{ + type PhysicalType = Self; + + fn to_raw(&self) -> &[Self::PhysicalType] { + std::slice::from_ref(self) + } +} + +impl FromFillValue<'_> for T +where + T: PhysicalType, +{ + fn from_raw( + raw: &[Self::PhysicalType], + ) -> Result { + if raw.len() == 1 { + Ok(raw[0]) + } else { + // SAFETY: this is safe when coming from core which forbids zero-length fill values + let found = CellValNum::try_from(raw.len() as u32).unwrap(); + + Err(FromFillValueError::UnexpectedCellStructure( + CellValNum::single(), + found, + )) + } + } +} + +impl IntoFillValue for [T; K] +where + T: PhysicalType, +{ + type PhysicalType = T; + + fn to_raw(&self) -> &[Self::PhysicalType] { + self + } +} + +impl<'a, T, const K: usize> FromFillValue<'a> for [T; K] +where + T: PhysicalType, +{ + fn from_raw( + raw: &'a [Self::PhysicalType], + ) -> Result { + Self::try_from(raw).map_err(|_| { + let expected = { + // SAFETY: there's no way to bound `0 < K < u32::MAX` for a trait impl + // so this can panic, but in a way that's statically known + let nz = + u32::try_from(K).ok().and_then(NonZeroU32::new).expect( + "`impl FillValue for [T; K] requires 0 < K < u32::MAX", + ); + CellValNum::Fixed(nz) + }; + + // SAFETY: this is safe when coming from core which forbids zero-length fill values + let found = CellValNum::try_from(raw.len() as u32).unwrap(); + + FromFillValueError::UnexpectedCellStructure(expected, found) + }) + } +} + +impl IntoFillValue for &[T] +where + T: PhysicalType, +{ + type PhysicalType = T; + + fn 
to_raw(&self) -> &[Self::PhysicalType] { + self + } +} + +impl<'a, T> FromFillValue<'a> for &'a [T] +where + T: PhysicalType, +{ + fn from_raw( + raw: &'a [Self::PhysicalType], + ) -> Result { + Ok(raw) + } +} + +impl IntoFillValue for Vec +where + T: PhysicalType, +{ + type PhysicalType = T; + + fn to_raw(&self) -> &[Self::PhysicalType] { + self.as_slice() + } +} + +impl FromFillValue<'_> for Vec +where + T: PhysicalType, +{ + fn from_raw( + raw: &[Self::PhysicalType], + ) -> Result { + Ok(raw.to_vec()) + } +} + +impl IntoFillValue for &str { + type PhysicalType = u8; + + fn to_raw(&self) -> &[Self::PhysicalType] { + self.as_bytes() + } +} + +impl<'a> FromFillValue<'a> for &'a str { + fn from_raw( + raw: &'a [Self::PhysicalType], + ) -> Result { + std::str::from_utf8(raw).map_err(|e| { + FromFillValueError::Construction(anyhow!( + "Non-UTF8 fill value: {}", + e + )) + }) + } +} + +impl IntoFillValue for String { + type PhysicalType = u8; + + fn to_raw(&self) -> &[Self::PhysicalType] { + self.as_bytes() + } +} + +impl<'a> FromFillValue<'a> for String { + fn from_raw( + raw: &'a [Self::PhysicalType], + ) -> Result { + <&'a str as FromFillValue<'a>>::from_raw(raw).map(|s| s.to_string()) + } +} + +#[cfg(test)] +mod tests { + use proptest::collection::vec; + use proptest::prelude::*; + + use super::*; + + fn fill_value_roundtrip(value: T) -> bool + where + T: for<'a> FromFillValue<'a> + PartialEq, + { + let value_out = T::from_raw(value.to_raw()); + if let Ok(value_out) = value_out { + value == value_out + } else { + false + } + } + + proptest! 
{ + #[test] + fn fill_value_roundtrip_u64(value in any::()) { + assert!(fill_value_roundtrip(value)) + } + + #[test] + fn fill_value_roundtrip_array(value in any::<[u64; 32]>()) { + assert!(fill_value_roundtrip(value)); + } + + #[test] + fn fill_value_roundtrip_vec(value in vec(any::(), 0..=64)) { + assert!(fill_value_roundtrip(value)); + } + + #[test] + fn fill_value_roundtrip_str(value in any::()) { + assert!(fill_value_roundtrip(value)); + } + } +} diff --git a/tiledb/common/src/array/dimension.rs b/tiledb/common/src/array/dimension.rs new file mode 100644 index 00000000..82f391fc --- /dev/null +++ b/tiledb/common/src/array/dimension.rs @@ -0,0 +1,255 @@ +use thiserror::Error; + +#[cfg(feature = "option-subset")] +use tiledb_utils::option::OptionSubset; + +#[cfg(feature = "serde")] +use serde::{Deserialize, Serialize}; + +use crate::array::CellValNum; +use crate::datatype::{Datatype, Error as DatatypeError}; +use crate::range::SingleValueRange; + +#[derive(Clone, Debug, Error)] +pub enum Error { + #[error("Invalid datatype: {0}")] + Datatype(#[from] DatatypeError), + #[error("Expected {} but found {0}", Datatype::StringAscii.to_string())] + ExpectedStringAscii(Datatype), +} + +#[derive(Clone, Debug, PartialEq)] +#[cfg_attr(feature = "option-subset", derive(OptionSubset))] +#[cfg_attr(feature = "serde", derive(Deserialize, Serialize))] +pub enum DimensionConstraints { + Int8([i8; 2], Option), + Int16([i16; 2], Option), + Int32([i32; 2], Option), + Int64([i64; 2], Option), + UInt8([u8; 2], Option), + UInt16([u16; 2], Option), + UInt32([u32; 2], Option), + UInt64([u64; 2], Option), + Float32([f32; 2], Option), + Float64([f64; 2], Option), + StringAscii, +} + +#[macro_export] +macro_rules! 
dimension_constraints_go { + ($expr:expr, $DT:ident, $range:pat, $extent:pat, $then:expr, $string:expr) => {{ + dimension_constraints_go!( + $expr, $DT, $range, $extent, $then, $then, $string + ) + }}; + ($expr:expr, $DT:ident, $range:pat, $extent:pat, $integral:expr, $float:expr, $string:expr) => {{ + use $crate::array::dimension::DimensionConstraints; + match $expr { + #[allow(unused_variables)] + DimensionConstraints::Int8($range, $extent) => { + #[allow(dead_code)] + type $DT = i8; + $integral + } + #[allow(unused_variables)] + DimensionConstraints::Int16($range, $extent) => { + #[allow(dead_code)] + type $DT = i16; + $integral + } + #[allow(unused_variables)] + DimensionConstraints::Int32($range, $extent) => { + #[allow(dead_code)] + type $DT = i32; + $integral + } + #[allow(unused_variables)] + DimensionConstraints::Int64($range, $extent) => { + #[allow(dead_code)] + type $DT = i64; + $integral + } + #[allow(unused_variables)] + DimensionConstraints::UInt8($range, $extent) => { + #[allow(dead_code)] + type $DT = u8; + $integral + } + #[allow(unused_variables)] + DimensionConstraints::UInt16($range, $extent) => { + #[allow(dead_code)] + type $DT = u16; + $integral + } + #[allow(unused_variables)] + DimensionConstraints::UInt32($range, $extent) => { + #[allow(dead_code)] + type $DT = u32; + $integral + } + #[allow(unused_variables)] + DimensionConstraints::UInt64($range, $extent) => { + #[allow(dead_code)] + type $DT = u64; + $integral + } + #[allow(unused_variables)] + DimensionConstraints::Float32($range, $extent) => { + #[allow(dead_code)] + type $DT = f32; + $float + } + #[allow(unused_variables)] + DimensionConstraints::Float64($range, $extent) => { + #[allow(dead_code)] + type $DT = f64; + $float + } + DimensionConstraints::StringAscii => $string, + } + }}; +} + +macro_rules! 
dimension_constraints_impl { + ($($V:ident : $U:ty),+) => { + $( + impl From<[$U; 2]> for DimensionConstraints { + fn from(value: [$U; 2]) -> DimensionConstraints { + DimensionConstraints::$V(value, None) + } + } + + impl From<&[$U; 2]> for DimensionConstraints { + fn from(value: &[$U; 2]) -> DimensionConstraints { + DimensionConstraints::$V([value[0], value[1]], None) + } + } + + impl From<([$U; 2], $U)> for DimensionConstraints { + fn from(value: ([$U; 2], $U)) -> DimensionConstraints { + DimensionConstraints::$V([value.0[0], value.0[1]], Some(value.1)) + } + } + + impl From<(&[$U; 2], $U)> for DimensionConstraints { + fn from(value: (&[$U; 2], $U)) -> DimensionConstraints { + DimensionConstraints::$V([value.0[0], value.0[1]], Some(value.1)) + } + } + + impl From<([$U; 2], Option<$U>)> for DimensionConstraints { + fn from(value: ([$U; 2], Option<$U>)) -> DimensionConstraints { + DimensionConstraints::$V([value.0[0], value.0[1]], value.1) + } + } + + impl From<(&[$U; 2], Option<$U>)> for DimensionConstraints { + fn from(value: (&[$U; 2], Option<$U>)) -> DimensionConstraints { + DimensionConstraints::$V([value.0[0], value.0[1]], value.1) + } + } + )+ + } +} + +dimension_constraints_impl!(Int8: i8, Int16: i16, Int32: i32, Int64: i64); +dimension_constraints_impl!(UInt8: u8, UInt16: u16, UInt32: u32, UInt64: u64); +dimension_constraints_impl!(Float32: f32, Float64: f64); + +impl DimensionConstraints { + /// Returns a [Datatype] which represents the physical type of this constraint. 
+ pub fn physical_datatype(&self) -> Datatype { + match self { + Self::UInt8(_, _) => Datatype::UInt8, + Self::UInt16(_, _) => Datatype::UInt16, + Self::UInt32(_, _) => Datatype::UInt32, + Self::UInt64(_, _) => Datatype::UInt64, + Self::Int8(_, _) => Datatype::Int8, + Self::Int16(_, _) => Datatype::Int16, + Self::Int32(_, _) => Datatype::Int32, + Self::Int64(_, _) => Datatype::Int64, + Self::Float32(_, _) => Datatype::Float32, + Self::Float64(_, _) => Datatype::Float64, + Self::StringAscii => Datatype::StringAscii, + } + } + + pub fn cell_val_num(&self) -> CellValNum { + match self { + DimensionConstraints::StringAscii => CellValNum::Var, + _ => CellValNum::single(), + } + } + + pub fn verify_type_compatible( + &self, + datatype: Datatype, + ) -> Result<(), Error> { + dimension_constraints_go!( + self, + DT, + _range, + _extent, + { + if !datatype.is_compatible_type::
() { + return Err(Error::Datatype( + DatatypeError::physical_type_incompatible::
( + datatype, + ), + )); + } + }, + { + if !matches!(datatype, Datatype::StringAscii) { + return Err(Error::ExpectedStringAscii(datatype)); + } + } + ); + + Ok(()) + } + + /// Returns the number of cells spanned by this constraint, if applicable + pub fn num_cells(&self) -> Option { + let (low, high) = crate::dimension_constraints_go!( + self, + _DT, + [low, high], + _, + (i128::from(*low), i128::from(*high)), + return None, + return None + ); + + Some(1 + (high - low) as u128) + } + /// Returns the number of cells spanned by a + /// single tile under this constraint, if applicable + pub fn num_cells_per_tile(&self) -> Option { + crate::dimension_constraints_go!( + self, + _DT, + _, + extent, + extent.map(|extent| { + #[allow(clippy::unnecessary_fallible_conversions)] + // this `unwrap` should be safe, validation will confirm nonzero + usize::try_from(extent).unwrap() + }), + None, + None + ) + } + + /// Returns the domain of the dimension constraint, if present, as a range. + pub fn domain(&self) -> Option { + crate::dimension_constraints_go!( + self, + _DT, + [low, high], + _, + Some(SingleValueRange::from(&[*low, *high])), + None + ) + } +} diff --git a/tiledb/common/src/array/mod.rs b/tiledb/common/src/array/mod.rs new file mode 100644 index 00000000..6a7166cb --- /dev/null +++ b/tiledb/common/src/array/mod.rs @@ -0,0 +1,385 @@ +pub mod attribute; +pub mod dimension; + +use std::fmt::{Debug, Display, Formatter, Result as FmtResult}; +use std::num::NonZeroU32; + +use thiserror::Error; + +#[cfg(feature = "option-subset")] +use tiledb_utils::option::OptionSubset; + +#[cfg(feature = "proptest-strategies")] +use proptest::prelude::*; + +#[cfg(feature = "serde")] +use serde::{Deserialize, Serialize}; + +#[derive(Clone, Copy, Debug, PartialEq)] +pub enum Mode { + Read, + Write, + Delete, + Update, + ModifyExclusive, +} + +#[derive(Clone, Debug, Eq, Error, PartialEq)] +pub enum ModeError { + #[error("Invalid discriminant for {}: {0}", std::any::type_name::())] + 
InvalidDiscriminant(u64), +} + +impl From for ffi::tiledb_query_type_t { + fn from(value: Mode) -> Self { + match value { + Mode::Read => ffi::tiledb_query_type_t_TILEDB_READ, + Mode::Write => ffi::tiledb_query_type_t_TILEDB_WRITE, + Mode::Delete => ffi::tiledb_query_type_t_TILEDB_DELETE, + Mode::Update => ffi::tiledb_query_type_t_TILEDB_UPDATE, + Mode::ModifyExclusive => { + ffi::tiledb_query_type_t_TILEDB_MODIFY_EXCLUSIVE + } + } + } +} + +impl TryFrom for Mode { + type Error = ModeError; + + fn try_from(value: ffi::tiledb_query_type_t) -> Result { + Ok(match value { + ffi::tiledb_query_type_t_TILEDB_READ => Mode::Read, + ffi::tiledb_query_type_t_TILEDB_WRITE => Mode::Write, + ffi::tiledb_query_type_t_TILEDB_DELETE => Mode::Delete, + ffi::tiledb_query_type_t_TILEDB_UPDATE => Mode::Update, + ffi::tiledb_query_type_t_TILEDB_MODIFY_EXCLUSIVE => { + Mode::ModifyExclusive + } + _ => return Err(ModeError::InvalidDiscriminant(value as u64)), + }) + } +} + +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +#[cfg_attr(feature = "option-subset", derive(OptionSubset))] +#[cfg_attr(feature = "serde", derive(Deserialize, Serialize))] +pub enum TileOrder { + RowMajor, + ColumnMajor, +} + +#[derive(Clone, Debug, Eq, Error, PartialEq)] +pub enum TileOrderError { + #[error("Invalid discriminant for {}: {0}", std::any::type_name::())] + InvalidDiscriminant(u64), +} + +impl From for ffi::tiledb_layout_t { + fn from(value: TileOrder) -> Self { + match value { + TileOrder::RowMajor => ffi::tiledb_layout_t_TILEDB_ROW_MAJOR, + TileOrder::ColumnMajor => ffi::tiledb_layout_t_TILEDB_COL_MAJOR, + } + } +} + +impl TryFrom for TileOrder { + type Error = TileOrderError; + fn try_from(value: ffi::tiledb_layout_t) -> Result { + match value { + ffi::tiledb_layout_t_TILEDB_ROW_MAJOR => Ok(TileOrder::RowMajor), + ffi::tiledb_layout_t_TILEDB_COL_MAJOR => Ok(TileOrder::ColumnMajor), + _ => Err(TileOrderError::InvalidDiscriminant(value as u64)), + } + } +} + +#[cfg(feature = "proptest-strategies")] +impl 
Arbitrary for TileOrder { + type Parameters = (); + type Strategy = BoxedStrategy; + + fn arbitrary_with(_: Self::Parameters) -> Self::Strategy { + prop_oneof![Just(TileOrder::RowMajor), Just(TileOrder::ColumnMajor)] + .boxed() + } +} + +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +#[cfg_attr(feature = "option-subset", derive(OptionSubset))] +#[cfg_attr(feature = "serde", derive(Deserialize, Serialize))] +pub enum CellOrder { + Unordered, + RowMajor, + ColumnMajor, + Global, + Hilbert, +} + +#[derive(Clone, Debug, Eq, Error, PartialEq)] +pub enum CellOrderError { + #[error("Invalid discriminant for {}: {0}", std::any::type_name::())] + InvalidDiscriminant(u64), +} + +impl From for ffi::tiledb_layout_t { + fn from(value: CellOrder) -> Self { + match value { + CellOrder::Unordered => ffi::tiledb_layout_t_TILEDB_UNORDERED, + CellOrder::RowMajor => ffi::tiledb_layout_t_TILEDB_ROW_MAJOR, + CellOrder::ColumnMajor => ffi::tiledb_layout_t_TILEDB_COL_MAJOR, + CellOrder::Global => ffi::tiledb_layout_t_TILEDB_GLOBAL_ORDER, + CellOrder::Hilbert => ffi::tiledb_layout_t_TILEDB_HILBERT, + } + } +} + +impl TryFrom for CellOrder { + type Error = CellOrderError; + fn try_from(value: ffi::tiledb_layout_t) -> Result { + match value { + ffi::tiledb_layout_t_TILEDB_UNORDERED => Ok(CellOrder::Unordered), + ffi::tiledb_layout_t_TILEDB_ROW_MAJOR => Ok(CellOrder::RowMajor), + ffi::tiledb_layout_t_TILEDB_COL_MAJOR => Ok(CellOrder::ColumnMajor), + ffi::tiledb_layout_t_TILEDB_GLOBAL_ORDER => Ok(CellOrder::Global), + ffi::tiledb_layout_t_TILEDB_HILBERT => Ok(CellOrder::Hilbert), + _ => Err(CellOrderError::InvalidDiscriminant(value as u64)), + } + } +} + +#[cfg(feature = "proptest-strategies")] +impl Arbitrary for CellOrder { + type Strategy = BoxedStrategy; + type Parameters = Option; + + fn arbitrary_with(args: Self::Parameters) -> Self::Strategy { + match args { + None => prop_oneof![ + Just(CellOrder::Unordered), + Just(CellOrder::RowMajor), + Just(CellOrder::ColumnMajor), + 
Just(CellOrder::Hilbert), + ] + .boxed(), + Some(ArrayType::Sparse) => prop_oneof![ + Just(CellOrder::RowMajor), + Just(CellOrder::ColumnMajor), + Just(CellOrder::Hilbert), + ] + .boxed(), + Some(ArrayType::Dense) => prop_oneof![ + Just(CellOrder::RowMajor), + Just(CellOrder::ColumnMajor), + ] + .boxed(), + } + } +} + +#[derive(Clone, Copy, Debug, Default, Eq, PartialEq)] +#[cfg_attr(feature = "option-subset", derive(OptionSubset))] +#[cfg_attr(feature = "serde", derive(Deserialize, Serialize))] +pub enum ArrayType { + #[default] + Dense, + Sparse, +} + +#[derive(Clone, Debug, Eq, Error, PartialEq)] +pub enum ArrayTypeError { + #[error("Invalid discriminant for {}: {0}", std::any::type_name::())] + InvalidDiscriminant(u64), +} + +impl From for ffi::tiledb_array_type_t { + fn from(value: ArrayType) -> Self { + match value { + ArrayType::Dense => ffi::tiledb_array_type_t_TILEDB_DENSE, + ArrayType::Sparse => ffi::tiledb_array_type_t_TILEDB_SPARSE, + } + } +} + +impl TryFrom for ArrayType { + type Error = ArrayTypeError; + fn try_from(value: ffi::tiledb_array_type_t) -> Result { + match value { + ffi::tiledb_array_type_t_TILEDB_DENSE => Ok(ArrayType::Dense), + ffi::tiledb_array_type_t_TILEDB_SPARSE => Ok(ArrayType::Sparse), + _ => Err(ArrayTypeError::InvalidDiscriminant(value as u64)), + } + } +} + +#[cfg(feature = "proptest-strategies")] +impl Arbitrary for ArrayType { + type Parameters = (); + type Strategy = BoxedStrategy; + + fn arbitrary_with(_: Self::Parameters) -> Self::Strategy { + prop_oneof![Just(ArrayType::Dense), Just(ArrayType::Sparse)].boxed() + } +} + +/// Represents the number of values carried within a single cell of an attribute or dimension. +#[derive(Copy, Clone, Debug, Eq, PartialEq)] +#[cfg_attr(feature = "option-subset", derive(OptionSubset))] +#[cfg_attr(feature = "serde", derive(Deserialize, Serialize))] +pub enum CellValNum { + /// The number of values per cell is a specific fixed number. 
+ Fixed(std::num::NonZeroU32), + /// The number of values per cell varies. + /// When this option is used for a dimension or attribute, queries must allocate additional + /// space to hold structural information about each cell. The values will be concatenated + /// together in a single buffer, and the structural data buffer contains the offset + /// of each record into the values buffer. + Var, +} + +impl CellValNum { + pub fn single() -> Self { + CellValNum::Fixed(NonZeroU32::new(1).unwrap()) + } + + pub fn is_var_sized(&self) -> bool { + matches!(self, CellValNum::Var) + } + + pub fn is_single_valued(&self) -> bool { + matches!(self, CellValNum::Fixed(nz) if nz.get() == 1) + } + + /// Return the fixed number of values per cell, if not variable. + pub fn fixed(&self) -> Option { + if let CellValNum::Fixed(nz) = self { + Some(*nz) + } else { + None + } + } +} + +#[derive(Clone, Debug, Eq, Error, PartialEq)] +pub enum CellValNumError { + #[error("{} cannot be zero", std::any::type_name::())] + CannotBeZero, +} + +impl Default for CellValNum { + fn default() -> Self { + Self::single() + } +} + +impl PartialEq for CellValNum { + fn eq(&self, other: &u32) -> bool { + match self { + CellValNum::Fixed(val) => val.get() == *other, + CellValNum::Var => *other == u32::MAX, + } + } +} + +impl Display for CellValNum { + fn fmt(&self, f: &mut Formatter) -> FmtResult { + Debug::fmt(self, f) + } +} + +impl TryFrom for CellValNum { + type Error = CellValNumError; + fn try_from(value: u32) -> Result { + match value { + 0 => Err(CellValNumError::CannotBeZero), + u32::MAX => Ok(CellValNum::Var), + v => Ok(CellValNum::Fixed(NonZeroU32::new(v).unwrap())), + } + } +} + +impl From for u32 { + fn from(value: CellValNum) -> Self { + match value { + CellValNum::Fixed(nz) => nz.get(), + CellValNum::Var => u32::MAX, + } + } +} + +#[cfg(feature = "proptest-strategies")] +impl Arbitrary for CellValNum { + type Strategy = BoxedStrategy; + type Parameters = Option>; + + fn arbitrary_with(r: 
Self::Parameters) -> Self::Strategy { + if let Some(range) = r { + (range.start.get()..range.end.get()) + .prop_map(|nz| CellValNum::try_from(nz).unwrap()) + .boxed() + } else { + prop_oneof![ + 30 => Just(CellValNum::single()), + 30 => Just(CellValNum::Var), + 25 => (2u32..=8).prop_map(|nz| CellValNum::try_from(nz).unwrap()), + 10 => (9u32..=16).prop_map(|nz| CellValNum::try_from(nz).unwrap()), + 3 => (17u32..=32).prop_map(|nz| CellValNum::try_from(nz).unwrap()), + 2 => (33u32..=64).prop_map(|nz| CellValNum::try_from(nz).unwrap()), + // NB: large fixed CellValNums don't really reflect production use cases + // and are not well tested, and are known to cause problems + ].boxed() + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn ffi_mode() { + for m in [Mode::Read, Mode::Write, Mode::Delete, Mode::Update] { + assert_eq!( + m, + Mode::try_from(ffi::tiledb_query_type_t::from(m)).unwrap() + ); + } + } + + #[test] + fn ffi_tile_order() { + for t in [TileOrder::RowMajor, TileOrder::ColumnMajor] { + assert_eq!( + t, + TileOrder::try_from(ffi::tiledb_layout_t::from(t)).unwrap() + ); + } + } + + #[test] + fn ffi_cell_order() { + for c in [ + CellOrder::Unordered, + CellOrder::RowMajor, + CellOrder::ColumnMajor, + CellOrder::Global, + CellOrder::Hilbert, + ] { + assert_eq!( + c, + CellOrder::try_from(ffi::tiledb_layout_t::from(c)).unwrap() + ); + } + } + + #[test] + fn ffi_array_type() { + for a in [ArrayType::Dense, ArrayType::Sparse] { + assert_eq!( + a, + ArrayType::try_from(ffi::tiledb_array_type_t::from(a)).unwrap() + ); + } + } +} diff --git a/tiledb/common/src/datatype/arrow.rs b/tiledb/common/src/datatype/arrow.rs new file mode 100644 index 00000000..cc04f95c --- /dev/null +++ b/tiledb/common/src/datatype/arrow.rs @@ -0,0 +1,1074 @@ +use std::collections::HashMap; +use std::num::NonZeroU32; +use std::str::FromStr; +use std::sync::Arc; + +use arrow_schema::{Field, TimeUnit}; + +use super::Datatype; +use crate::array::CellValNum; + +/// 
Represents tiledb (`Datatype`, `CellValNum`) compatibility for an arrow `DataType`. +#[derive(Clone, Debug, Eq, PartialEq)] +pub enum DatatypeToArrowResult { + /// There is an exact logical match for the tiledb `Datatype`. + /// The individual values of the respective types have the same bit width + /// and are meant to be interpreted the same way. + /// + /// In general, this means that: + /// 1. `CellValNum::Fixed(1)` maps to an arrow primitive or date/time type. + /// ``` + /// use tiledb_common::array::CellValNum; + /// use tiledb_common::datatype::Datatype as TileDB; + /// use tiledb_common::datatype::arrow::{to_arrow, DatatypeToArrowResult}; + /// assert_eq!(DatatypeToArrowResult::Exact(arrow_schema::DataType::UInt8), + /// to_arrow(&TileDB::UInt8, CellValNum::single())); + /// ``` + /// 2. `CellValNum::Fixed(n) if n > 1` 1 maps to an arrow fixed size list. + /// ``` + /// use arrow_schema::DataType as Arrow; + /// use tiledb_common::array::CellValNum; + /// use tiledb_common::datatype::Datatype as TileDB; + /// use tiledb_common::datatype::arrow::{to_arrow, DatatypeToArrowResult}; + /// + /// let arrow = to_arrow(&TileDB::UInt8, CellValNum::try_from(8).unwrap()); + /// let DatatypeToArrowResult::Exact(Arrow::FixedSizeList(item, fixed_len)) = arrow else { unreachable!() }; + /// assert_eq!(*item.data_type(), Arrow::UInt8); + /// assert_eq!(fixed_len, 8); + /// ``` + /// 3. `CellValNum::Var` maps to an arrow `LargeList`. 
+ /// ``` + /// use arrow_schema::DataType as Arrow; + /// use tiledb_common::array::CellValNum; + /// use tiledb_common::datatype::Datatype as TileDB; + /// use tiledb_common::datatype::arrow::{to_arrow, DatatypeToArrowResult}; + /// + /// let arrow = to_arrow(&TileDB::UInt8, CellValNum::Var); + /// let DatatypeToArrowResult::Exact(Arrow::LargeList(item)) = arrow else { unreachable!() }; + /// assert_eq!(*item.data_type(), Arrow::UInt8); + /// ``` + /// + /// There are some exceptions, such as `(Datatype::Blob, CellValNum::Var)` + /// mapping to `arrow_schema::DataType::LargeBinary`, which is always variable-length. + /// + /// ``` + /// use tiledb_common::array::CellValNum; + /// use tiledb_common::datatype::Datatype as TileDB; + /// use tiledb_common::datatype::arrow::{to_arrow, DatatypeToArrowResult}; + /// + /// assert_eq!(DatatypeToArrowResult::Exact(arrow_schema::DataType::LargeBinary), + /// to_arrow(&TileDB::Blob, CellValNum::Var)); + /// ``` + /// When the output is any kind of list, field metadata may be used to represent the exact + /// input datatype if the input on its own is an inexact match. 
+ /// ``` + /// use arrow_schema::DataType as Arrow; + /// use std::str::FromStr; + /// use tiledb_common::array::CellValNum; + /// use tiledb_common::datatype::Datatype as TileDB; + /// use tiledb_common::datatype::arrow::{to_arrow, DatatypeToArrowResult, ARROW_FIELD_METADATA_KEY_TILEDB_TYPE_HINT}; + /// + /// let arrow = to_arrow(&TileDB::StringAscii, CellValNum::Var); + /// let DatatypeToArrowResult::Exact(Arrow::LargeList(item)) = arrow else { unreachable!() }; + /// assert_eq!(*item.data_type(), Arrow::UInt8); + /// + /// let Some(s) = item.metadata().get(ARROW_FIELD_METADATA_KEY_TILEDB_TYPE_HINT) + /// else { unreachable!() }; + /// assert_eq!(Some(TileDB::StringAscii), TileDB::from_str(s).ok()); + /// ``` + Exact(arrow_schema::DataType), + /// There is no corresponding logical data type, but a physical data type + /// with the same bit width can be used to represent primitive values, + /// and there is a trivial or cheap conversion between value structural data. + /// ``` + /// use tiledb_common::array::CellValNum; + /// use tiledb_common::datatype::Datatype as TileDB; + /// use tiledb_common::datatype::arrow::{to_arrow, DatatypeToArrowResult}; + /// + /// assert_eq!(DatatypeToArrowResult::Inexact(arrow_schema::DataType::UInt8), + /// to_arrow(&TileDB::StringAscii, CellValNum::single())); + /// ``` + Inexact(arrow_schema::DataType), +} + +impl DatatypeToArrowResult { + pub fn is_inexact(&self) -> bool { + matches!(self, Self::Inexact(_)) + } + + pub fn is_exact(&self) -> bool { + matches!(self, Self::Exact(_)) + } + + pub fn into_inner(self) -> arrow_schema::DataType { + match self { + Self::Exact(arrow) => arrow, + Self::Inexact(arrow) => arrow, + } + } +} + +/* + * (Datatype::StringAscii, CellValNum::Var) does not have an exact analog in Arrow. + * Utf8 sounds pretty good, but we can't use it because Arrow validates Utf8 and + * tiledb does not. So we use `LargeList(UInt8)` instead. 
+ * However, in tiledb StringAscii has several special accommodations which + * are not granted to UInt8. We must be able to invert back to StringAscii. + * We can do that by storing the exact input datatype on the arrow list field metadata. + */ +/// `arrow_schema::Field` metadata key for the original `tiledb_common::datatype::Datatype` variant +/// if there is no exact mapping from `tiledb_common::datatype::Datatype` to `arrow_schema::DataType`. +pub const ARROW_FIELD_METADATA_KEY_TILEDB_TYPE_HINT: &str = "tiledb_type_hint"; + +pub fn to_arrow( + datatype: &Datatype, + cell_val_num: CellValNum, +) -> DatatypeToArrowResult { + use arrow_schema::DataType as ADT; + + type Res = DatatypeToArrowResult; + + match cell_val_num { + CellValNum::Fixed(nz) if nz.get() == 1 => { + match datatype { + Datatype::Int8 => Res::Exact(ADT::Int8), + Datatype::Int16 => Res::Exact(ADT::Int16), + Datatype::Int32 => Res::Exact(ADT::Int32), + Datatype::Int64 => Res::Exact(ADT::Int64), + Datatype::UInt8 => Res::Exact(ADT::UInt8), + Datatype::UInt16 => Res::Exact(ADT::UInt16), + Datatype::UInt32 => Res::Exact(ADT::UInt32), + Datatype::UInt64 => Res::Exact(ADT::UInt64), + Datatype::Float32 => Res::Exact(ADT::Float32), + Datatype::Float64 => Res::Exact(ADT::Float64), + Datatype::DateTimeSecond => { + Res::Exact(ADT::Timestamp(TimeUnit::Second, None)) + } + Datatype::DateTimeMillisecond => { + Res::Exact(ADT::Timestamp(TimeUnit::Millisecond, None)) + } + Datatype::DateTimeMicrosecond => { + Res::Exact(ADT::Timestamp(TimeUnit::Microsecond, None)) + } + Datatype::DateTimeNanosecond => { + Res::Exact(ADT::Timestamp(TimeUnit::Nanosecond, None)) + } + Datatype::TimeMicrosecond => { + Res::Exact(ADT::Time64(TimeUnit::Microsecond)) + } + Datatype::TimeNanosecond => { + Res::Exact(ADT::Time64(TimeUnit::Nanosecond)) + } + Datatype::Char => Res::Inexact(ADT::Int8), + Datatype::StringAscii => Res::Inexact(ADT::UInt8), + Datatype::StringUtf8 => Res::Inexact(ADT::UInt8), + Datatype::StringUtf16 => 
Res::Inexact(ADT::UInt16), + Datatype::StringUtf32 => Res::Inexact(ADT::UInt32), + Datatype::StringUcs2 => Res::Inexact(ADT::UInt16), + Datatype::StringUcs4 => Res::Inexact(ADT::UInt32), + Datatype::DateTimeDay + | Datatype::DateTimeYear + | Datatype::DateTimeMonth + | Datatype::DateTimeWeek + | Datatype::DateTimeHour + | Datatype::DateTimeMinute + | Datatype::DateTimePicosecond + | Datatype::DateTimeFemtosecond + | Datatype::DateTimeAttosecond + | Datatype::TimeHour + | Datatype::TimeMinute + | Datatype::TimeSecond + | Datatype::TimeMillisecond + | Datatype::TimePicosecond + | Datatype::TimeFemtosecond + | Datatype::TimeAttosecond => { + // these are signed 64-bit integers in tiledb, + // arrow datetypes with the same precision are 32 bits + // (or there is no equivalent time unit) + Res::Inexact(ADT::Int64) + } + Datatype::Blob + | Datatype::Boolean + | Datatype::GeometryWkb + | Datatype::GeometryWkt => Res::Inexact(ADT::UInt8), + Datatype::Any => { + // note that this likely is unreachable if the tiledb API is used + // correctly, as `Datatype::Any` requires `CellValNum::Var` + Res::Inexact(ADT::UInt8) + } + } + } + CellValNum::Fixed(nz) => match i32::try_from(nz.get()) { + Ok(nz) => { + if matches!(datatype, Datatype::Blob) { + Res::Exact(ADT::FixedSizeBinary(nz)) + } else { + match to_arrow(datatype, CellValNum::single()) { + Res::Exact(item) => Res::Exact(ADT::FixedSizeList( + Arc::new(arrow_schema::Field::new_list_field( + item, false, + )), + nz, + )), + Res::Inexact(item) => { + let metadata = HashMap::from_iter([( + ARROW_FIELD_METADATA_KEY_TILEDB_TYPE_HINT + .to_string(), + datatype.to_string(), + )]); + + let item = Arc::new( + Field::new_list_field(item, false) + .with_metadata(metadata), + ); + Res::Exact(ADT::FixedSizeList(item, nz)) + } + } + } + } + Err(_) => unimplemented!(), + }, + CellValNum::Var => { + if let Datatype::Blob = datatype { + Res::Exact(ADT::LargeBinary) + } else { + /* + * TODO: + * We could, and probably ought to, treat Utf8 in a 
similar fashion + * to LargeBinary as above. However, arrow (in contrast to tiledb) + * actually does to a UTF-8 integrity check. Until tiledb also + * does that, and we update our test strategies to generate + * valid UTF-8 sequences, we cannot do so. + */ + match to_arrow(datatype, CellValNum::single()) { + Res::Exact(item) => { + let item = Arc::new(Field::new_list_field(item, false)); + Res::Exact(ADT::LargeList(item)) + } + Res::Inexact(item) => { + let metadata = HashMap::from_iter([( + ARROW_FIELD_METADATA_KEY_TILEDB_TYPE_HINT + .to_string(), + datatype.to_string(), + )]); + let item = Arc::new( + Field::new_list_field(item, false) + .with_metadata(metadata), + ); + Res::Exact(ADT::LargeList(item)) + } + } + } + } + } +} + +/// Represents arrow type compatibility for a tiledb `Datatype` paired with a `CellValNum`. +#[derive(Copy, Clone, Debug, Eq, PartialEq)] +pub enum DatatypeFromArrowResult { + /// There is no reasonable matching type in tiledb. + /// This includes, but is not limited to, + /// types with 32-bit offsets; complex data types; view types; decimal types; and the null type. + None, + /// There is an exact logical match for the arrow `DataType`. + /// The individual values of the respective types have the same bit width + /// and are meant to be interpreted the same way. + /// ``` + /// use arrow_schema::DataType as Arrow; + /// use tiledb_common::array::CellValNum; + /// use tiledb_common::datatype::Datatype as TileDB; + /// use tiledb_common::datatype::arrow::{from_arrow, DatatypeFromArrowResult}; + /// + /// let tiledb = from_arrow(&Arrow::new_large_list(Arrow::Date32, false)); + /// assert_eq!(DatatypeFromArrowResult::Inexact(TileDB::Int32, CellValNum::Var), tiledb); + /// ``` + Exact(Datatype, CellValNum), + /// There is no corresponding logical data type, but a physical data type + /// with the same bit width can be used to represent primitive values, + /// and there is a trivial or cheap conversion between value structural data. 
+ /// ``` + /// use arrow_schema::DataType as Arrow; + /// use tiledb_common::array::CellValNum; + /// use tiledb_common::datatype::Datatype as TileDB; + /// use tiledb_common::datatype::arrow::{from_arrow, DatatypeFromArrowResult}; + /// + /// let tiledb = from_arrow(&Arrow::Date32); + /// assert_eq!(DatatypeFromArrowResult::Inexact(TileDB::Int32, CellValNum::single()), tiledb); + /// ``` + Inexact(Datatype, CellValNum), +} + +impl DatatypeFromArrowResult { + pub fn is_inexact(&self) -> bool { + matches!(self, Self::Inexact(_, _)) + } + + pub fn is_exact(&self) -> bool { + matches!(self, Self::Exact(_, _)) + } + + pub fn ok(self) -> Option<(Datatype, CellValNum)> { + match self { + Self::None => None, + Self::Exact(dt, cv) => Some((dt, cv)), + Self::Inexact(dt, cv) => Some((dt, cv)), + } + } +} + +pub fn from_arrow(value: &arrow_schema::DataType) -> DatatypeFromArrowResult { + use arrow_schema::DataType as ADT; + + type Res = DatatypeFromArrowResult; + + match value { + ADT::Null => Res::None, + ADT::Int8 => Res::Exact(Datatype::Int8, CellValNum::single()), + ADT::Int16 => Res::Exact(Datatype::Int16, CellValNum::single()), + ADT::Int32 => Res::Exact(Datatype::Int32, CellValNum::single()), + ADT::Int64 => Res::Exact(Datatype::Int64, CellValNum::single()), + ADT::UInt8 => Res::Exact(Datatype::UInt8, CellValNum::single()), + ADT::UInt16 => Res::Exact(Datatype::UInt16, CellValNum::single()), + ADT::UInt32 => Res::Exact(Datatype::UInt32, CellValNum::single()), + ADT::UInt64 => Res::Exact(Datatype::UInt64, CellValNum::single()), + ADT::Float16 => { + /* tiledb has no f16 type, so use u16 as a 2-byte container */ + Res::Inexact(Datatype::UInt16, CellValNum::single()) + } + ADT::Float32 => Res::Exact(Datatype::Float32, CellValNum::single()), + ADT::Float64 => Res::Exact(Datatype::Float64, CellValNum::single()), + ADT::Decimal128(_, _) | ADT::Decimal256(_, _) => { + /* + * We could map this to fixed-length blob but probably + * better to do a proper 128 or 256 bit thing in 
core + * so we avoid making mistakes here + */ + Res::None + } + ADT::Timestamp(TimeUnit::Second, _) => { + Res::Exact(Datatype::DateTimeSecond, CellValNum::single()) + } + ADT::Timestamp(TimeUnit::Millisecond, _) => { + Res::Exact(Datatype::DateTimeMillisecond, CellValNum::single()) + } + ADT::Timestamp(TimeUnit::Microsecond, _) => { + Res::Exact(Datatype::DateTimeMicrosecond, CellValNum::single()) + } + ADT::Timestamp(TimeUnit::Nanosecond, _) => { + Res::Exact(Datatype::DateTimeNanosecond, CellValNum::single()) + } + ADT::Date32 | ADT::Time32(_) => { + Res::Inexact(Datatype::Int32, CellValNum::single()) + } + ADT::Date64 => { + Res::Inexact(Datatype::DateTimeMillisecond, CellValNum::single()) + } + ADT::Time64(TimeUnit::Microsecond) => { + Res::Exact(Datatype::TimeMicrosecond, CellValNum::single()) + } + ADT::Time64(TimeUnit::Nanosecond) => { + Res::Exact(Datatype::TimeNanosecond, CellValNum::single()) + } + ADT::Time64(_) => Res::Inexact(Datatype::UInt64, CellValNum::single()), + ADT::Boolean => { + /* this may be bit-packed by arrow but is not by tiledb */ + Res::None + } + ADT::Duration(_) | ADT::Interval(_) => { + /* these are scalars but the doc does not specify bit width */ + Res::None + } + ADT::LargeBinary => Res::Exact(Datatype::Blob, CellValNum::Var), + ADT::FixedSizeBinary(len) => match u32::try_from(*len) { + Ok(len) => match NonZeroU32::new(len) { + None => Res::None, + Some(nz) => Res::Exact(Datatype::Blob, CellValNum::Fixed(nz)), + }, + Err(_) => Res::None, + }, + ADT::FixedSizeList(ref item, ref len) => { + let len = match u32::try_from(*len).ok().and_then(NonZeroU32::new) { + Some(len) => len, + None => return Res::None, + }; + if item.is_nullable() { + // tiledb validity applies to the entire cell, not the values within the cell. 
+ // there is currently no way to represent null values within a cell + Res::None + } else if item.data_type().primitive_width().is_none() { + /* + * probably there are some cases we can handle, + * but let's omit for now + */ + Res::None + } else if let Some(exact_datatype) = item + .metadata() + .get(ARROW_FIELD_METADATA_KEY_TILEDB_TYPE_HINT) + .and_then(|s| Datatype::from_str(s).ok()) + { + Res::Exact(exact_datatype, CellValNum::Fixed(len)) + } else { + match from_arrow(item.data_type()) { + Res::None => Res::None, + Res::Inexact(item, item_cell_val) => { + let cell_val_num = match item_cell_val { + CellValNum::Fixed(nz) => { + match nz.checked_mul(len) { + None => return Res::None, + Some(nz) => CellValNum::Fixed(nz), + } + } + CellValNum::Var => CellValNum::Var, + }; + Res::Inexact(item, cell_val_num) + } + Res::Exact(item, item_cell_val) => { + let cell_val_num = match item_cell_val { + CellValNum::Fixed(nz) => { + match nz.checked_mul(len) { + None => return Res::None, + Some(nz) => CellValNum::Fixed(nz), + } + } + CellValNum::Var => CellValNum::Var, + }; + Res::Exact(item, cell_val_num) + } + } + } + } + ADT::LargeUtf8 => { + /* + * NB: arrow checks for valid UTF-8 but tiledb does not. + * This is not an exact conversion for that reason + * because we cannot guarantee invertibility. + */ + Res::Inexact(Datatype::StringUtf8, CellValNum::Var) + } + ADT::LargeList(ref item) => { + if item.is_nullable() { + // tiledb validity applies to the entire cell, not the values within the cell. 
+ // there is currently no way to represent null values within a cell + Res::None + } else if item.data_type().primitive_width().is_none() { + /* + * probably there are some cases we can handle, + * but let's omit for now + */ + Res::None + } else if let Some(exact_datatype) = item + .metadata() + .get(ARROW_FIELD_METADATA_KEY_TILEDB_TYPE_HINT) + .and_then(|s| Datatype::from_str(s).ok()) + { + Res::Exact(exact_datatype, CellValNum::Var) + } else { + match from_arrow(item.data_type()) { + Res::None => Res::None, + Res::Inexact(item, CellValNum::Fixed(nz)) + if nz.get() == 1 => + { + Res::Inexact(item, CellValNum::Var) + } + Res::Exact(item, CellValNum::Fixed(nz)) + if nz.get() == 1 => + { + Res::Exact(item, CellValNum::Var) + } + _ => { + /* + * We probably *can* fill in more cases, but either: + * 1) we need to do work to keep the fixed cell val num around, doable but + * why bother right now + * 2) we need to keep multiple levels of offsets, not supported right now + */ + Res::None + } + } + } + } + ADT::Binary | ADT::Utf8 | ADT::List(_) => { + /* offsets are 64 bits, these types use 32-bit offsets */ + Res::None + } + ADT::BinaryView + | ADT::Utf8View + | ADT::ListView(_) + | ADT::LargeListView(_) => { + /* data does not arrive from tiledb core in this format */ + Res::None + } + ADT::Struct(_) + | ADT::Union(_, _) + | ADT::Dictionary(_, _) + | ADT::Map(_, _) + | ADT::RunEndEncoded(_, _) => { + /* complex types are not implemented */ + Res::None + } + } +} + +#[cfg(any(test, feature = "proptest-strategies"))] +pub mod strategy { + use std::collections::HashMap; + + use proptest::prelude::*; + + use super::*; + + #[derive(Clone, Debug)] + pub struct FieldParameters { + pub min_fixed_binary_len: i32, + pub max_fixed_binary_len: i32, + pub min_numeric_precision: u8, + pub max_numeric_precision: u8, + pub min_fixed_list_len: i32, + pub max_fixed_list_len: i32, + pub min_struct_fields: usize, + pub max_struct_fields: usize, + pub min_recursion_depth: u32, + pub 
max_recursion_depth: u32, + } + + impl Default for FieldParameters { + fn default() -> Self { + const DEFAULT_MAX_FIXED_BINARY_LEN: i32 = 1024 * 1024; + const DEFAULT_MAX_FIXED_LIST_LEN: i32 = 2048; + + FieldParameters { + min_fixed_binary_len: 1, + max_fixed_binary_len: DEFAULT_MAX_FIXED_BINARY_LEN, + min_numeric_precision: 1, + max_numeric_precision: u8::MAX, + min_fixed_list_len: 0, + max_fixed_list_len: DEFAULT_MAX_FIXED_LIST_LEN, + min_struct_fields: 0, + max_struct_fields: 16, + min_recursion_depth: 0, + max_recursion_depth: 8, + } + } + } + + pub fn any_datatype( + params: FieldParameters, + ) -> impl Strategy { + use arrow_schema::{ + DataType as ADT, Field, Fields, IntervalUnit, TimeUnit, + }; + + let leaf = prop_oneof![ + Just(ADT::Null), + Just(ADT::Int8), + Just(ADT::Int16), + Just(ADT::Int32), + Just(ADT::Int64), + Just(ADT::UInt8), + Just(ADT::UInt16), + Just(ADT::UInt32), + Just(ADT::UInt64), + Just(ADT::Float16), + Just(ADT::Float32), + Just(ADT::Float64), + Just(ADT::Timestamp(TimeUnit::Second, None)), + Just(ADT::Timestamp(TimeUnit::Millisecond, None)), + Just(ADT::Timestamp(TimeUnit::Microsecond, None)), + Just(ADT::Timestamp(TimeUnit::Nanosecond, None)), + Just(ADT::Date32), + Just(ADT::Date64), + Just(ADT::Time32(TimeUnit::Second)), + Just(ADT::Time32(TimeUnit::Millisecond)), + Just(ADT::Time64(TimeUnit::Microsecond)), + Just(ADT::Time64(TimeUnit::Nanosecond)), + Just(ADT::Duration(TimeUnit::Second)), + Just(ADT::Duration(TimeUnit::Millisecond)), + Just(ADT::Duration(TimeUnit::Nanosecond)), + Just(ADT::Interval(IntervalUnit::YearMonth)), + Just(ADT::Interval(IntervalUnit::DayTime)), + Just(ADT::Interval(IntervalUnit::MonthDayNano)), + Just(ADT::Binary), + (params.min_fixed_binary_len..=params.max_fixed_binary_len) + .prop_map(ADT::FixedSizeBinary), + Just(ADT::LargeBinary), + Just(ADT::Utf8), + Just(ADT::LargeUtf8), + (params.min_numeric_precision..=params.max_numeric_precision) + .prop_flat_map(|precision| ( + Just(precision), + 
(0..precision.clamp(0, i8::MAX as u8) as i8) + ) + .prop_map(|(precision, scale)| ADT::Decimal128( + precision, scale + ))), + (params.min_numeric_precision..=params.max_numeric_precision) + .prop_flat_map(|precision| ( + Just(precision), + (0..precision.clamp(0, i8::MAX as u8) as i8) + ) + .prop_map(|(precision, scale)| ADT::Decimal256( + precision, scale + ))), + ]; + + leaf.prop_recursive( + params.max_recursion_depth, + params.max_recursion_depth * 4, + std::cmp::max( + 2, + (params.max_struct_fields / 4).try_into().unwrap(), + ), + move |strategy| { + prop_oneof![ + (strategy.clone(), any::()) + .prop_map(|(s, b)| ADT::new_list(s, b)), + ( + strategy.clone(), + params.min_fixed_list_len..=params.max_fixed_list_len, + any::() + ) + .prop_map(|(s, l, b)| { + ADT::FixedSizeList( + Arc::new(Field::new_list_field(s, b)), + l, + ) + }), + (strategy.clone(), any::()).prop_map(|(s, b)| { + ADT::LargeList(Arc::new(Field::new_list_field(s, b))) + }), + proptest::collection::vec( + (prop_field_name(), strategy.clone(), any::()), + params.min_struct_fields..=params.max_struct_fields + ) + .prop_map(|v| ADT::Struct( + v.into_iter() + .map(|(n, dt, b)| Field::new(n, dt, b)) + .collect::() + )) // union goes here + // dictionary goes here + // map goes here + // run-end encoded goes here + ] + }, + ) + } + + pub fn prop_field_name() -> impl Strategy { + proptest::string::string_regex("[a-zA-Z0-9_]*") + .expect("Error creating attribute name strategy") + .prop_filter( + "Attribute names may not begin with reserved prefix", + |name| !name.starts_with("__"), + ) + } + + pub fn prop_arrow_field() -> impl Strategy { + ( + prop_field_name(), + any_datatype(Default::default()), + any::(), + Just(HashMap::::new()), /* TODO: we'd like to check that metadata is preserved, + * but right now the CAPI doesn't appear to have a way + * to attach metadata to an attribute + */ + ) + .prop_map(|(name, data_type, nullable, metadata)| { + Field::new(name, data_type, 
nullable).with_metadata(metadata) + }) + } +} + +pub fn is_physical_type_match( + arrow_in: &arrow_schema::DataType, + arrow_out: &arrow_schema::DataType, +) -> bool { + if arrow_in == arrow_out { + return true; + } + + /* otherwise check some inexact compatibilities */ + use arrow_schema::DataType as ADT; + match (arrow_in, arrow_out) { + ( + ADT::FixedSizeList(ref item_in, len_in), + ADT::FixedSizeList(ref item_out, len_out), + ) => { + len_in == len_out + && is_physical_type_match( + item_in.data_type(), + item_out.data_type(), + ) + } + (ADT::LargeList(ref item_in), ADT::LargeList(ref item_out)) => { + is_physical_type_match(item_in.data_type(), item_out.data_type()) + } + (ADT::FixedSizeList(ref item_in, 1), dt_out) => { + /* + * fixed size list of 1 element should have no extra data, + * we probably don't need to keep the FixedSizeList part + * for correctness, punt on it for now and see if we need + * to deal with it later + */ + is_physical_type_match(item_in.data_type(), dt_out) + } + (ADT::LargeUtf8, ADT::LargeList(ref item)) => { + /* + * Arrow does checked UTF-8, tiledb does not, + * so we must permit this inexactness + */ + *item.data_type() == arrow_schema::DataType::UInt8 + && !item.is_nullable() + } + (dt_in, dt_out) => { + if dt_in.is_primitive() { + dt_in.primitive_width() == dt_out.primitive_width() + } else { + false + } + } + } +} + +#[cfg(test)] +pub mod tests { + use super::*; + use proptest::prelude::*; + + fn do_to_arrow_single(tdb_dt: Datatype) { + let cell_val_num = CellValNum::single(); + let arrow_dt = to_arrow(&tdb_dt, cell_val_num); + match arrow_dt { + DatatypeToArrowResult::Inexact(arrow) => { + assert!(arrow.is_primitive()); + let arrow_size = arrow.primitive_width().unwrap(); + assert_eq!( + tdb_dt.size(), + arrow_size, + "to_arrow({}, {:?}) = {}", + tdb_dt, + cell_val_num, + arrow + ); + + let tdb_out = from_arrow(&arrow); + let (tdb_out, cell_val_num_out) = tdb_out.ok().unwrap(); + + /* the datatype should not match exactly but 
it must be the same size */ + assert_ne!(tdb_dt, tdb_out); + assert_eq!(tdb_dt.size(), tdb_out.size()); + assert_eq!(cell_val_num, cell_val_num_out); + } + DatatypeToArrowResult::Exact(arrow) => { + assert!(arrow.is_primitive()); + let arrow_size = arrow.primitive_width().unwrap(); + assert_eq!( + tdb_dt.size(), + arrow_size, + "to_arrow({}, {:?}) = {}", + tdb_dt, + cell_val_num, + arrow + ); + + let tdb_out = from_arrow(&arrow); + if let DatatypeFromArrowResult::Exact( + tdb_out, + cell_val_num_out, + ) = tdb_out + { + /* the datatype must match exactly */ + assert_eq!(tdb_dt, tdb_out); + assert_eq!(cell_val_num, cell_val_num_out); + } else { + unreachable!( + "Exact conversion did not invert, found {:?}", + tdb_out + ) + } + } + } + } + + fn do_to_arrow_nonvar(tdb_dt: Datatype) { + let fixed_len_in = 32u32; + let cell_val_num = CellValNum::try_from(fixed_len_in).unwrap(); + let arrow_dt = to_arrow(&tdb_dt, cell_val_num); + + use arrow_schema::DataType as ADT; + match arrow_dt { + DatatypeToArrowResult::Inexact(arrow) => { + match arrow { + ADT::FixedSizeList(ref item, fixed_len_out) => { + let item_expect = + to_arrow(&tdb_dt, CellValNum::single()); + if let DatatypeToArrowResult::Inexact(item_expect) = + item_expect + { + assert_eq!(item_expect, *item.data_type()); + assert_eq!(fixed_len_in, fixed_len_out as u32); + } else { + unreachable!( + "Expected inexact item match, found {:?}", + item_expect + ) + } + } + arrow => unreachable!( + "Expected FixedSizeList for inexact match but found {}", + arrow + ), + } + + /* invertibility */ + let tdb_out = from_arrow(&arrow); + let (tdb_out, cell_val_num_out) = tdb_out.ok().unwrap(); + + /* inexact match will not be eq, but must be the same size */ + assert_eq!(tdb_dt.size(), tdb_out.size()); + assert_eq!(cell_val_num, cell_val_num_out); + } + DatatypeToArrowResult::Exact(arrow) => { + match arrow { + ADT::FixedSizeList(ref item, fixed_len_out) => { + if let Some(sub_exact) = item + .metadata() + 
.get(ARROW_FIELD_METADATA_KEY_TILEDB_TYPE_HINT) + { + let sub_exact = + Datatype::from_str(sub_exact).unwrap(); + assert_eq!(sub_exact.size(), tdb_dt.size()); + + // item must have been inexact, else we would not have the metadata + let item_dt = + to_arrow(&tdb_dt, CellValNum::single()); + if let DatatypeToArrowResult::Inexact(item_dt) = + item_dt + { + assert_eq!(*item.data_type(), item_dt); + } else { + unreachable!( + "Expected inexact item match but found {:?}", + item_dt + ) + } + } else { + // item must be exact match + let item_dt = + to_arrow(&tdb_dt, CellValNum::single()); + if let DatatypeToArrowResult::Exact(item_dt) = + item_dt + { + assert_eq!(*item.data_type(), item_dt); + } else { + unreachable!( + "Expected exact item match but found {:?}", + item_dt + ) + } + } + assert_eq!(fixed_len_in, fixed_len_out as u32); + } + ADT::FixedSizeBinary(fixed_len_out) => { + assert_eq!(tdb_dt, Datatype::Blob); + assert_eq!(fixed_len_in, fixed_len_out as u32); + } + adt => unreachable!( + "to_arrow({}, {:?}) = {}", + tdb_dt, cell_val_num, adt + ), + } + + /* invertibility */ + let tdb_out = from_arrow(&arrow); + if let DatatypeFromArrowResult::Exact( + tdb_out, + cell_val_num_out, + ) = tdb_out + { + assert_eq!(tdb_dt, tdb_out); + assert_eq!(cell_val_num, cell_val_num_out); + } else { + unreachable!( + "Arrow datatype did not invert, found {:?}", + tdb_out + ) + } + } + } + } + + fn do_to_arrow_var(tdb_dt: Datatype) { + let cell_val_num = CellValNum::Var; + let arrow_dt = to_arrow(&tdb_dt, cell_val_num); + + use arrow_schema::DataType as ADT; + match arrow_dt { + DatatypeToArrowResult::Inexact(arrow) => { + assert!( + !arrow.is_primitive(), + "to_arrow({}, {:?}) = {}", + tdb_dt, + cell_val_num, + arrow + ); + + if let ADT::LargeList(ref item) = arrow { + let item_expect = to_arrow(&tdb_dt, CellValNum::single()); + if let DatatypeToArrowResult::Inexact(item_expect) = + item_expect + { + assert_eq!(*item.data_type(), item_expect); + } else { + unreachable!( + 
"Expected inexact item match, but found {:?}", + item_expect + ) + } + } else { + /* other possibilities should be Exact */ + unreachable!( + "Expected LargeList for inexact match but found {:?}", + arrow + ) + } + + let tdb_out = from_arrow(&arrow); + let (tdb_out, cell_val_num_out) = tdb_out.ok().unwrap(); + + /* must be the same size */ + assert_eq!(tdb_dt.size(), tdb_out.size()); + assert_eq!(cell_val_num, cell_val_num_out); + } + DatatypeToArrowResult::Exact(arrow) => { + assert!( + !arrow.is_primitive(), + "to_arrow({}, {:?}) = {}", + tdb_dt, + cell_val_num, + arrow + ); + + match arrow { + ADT::LargeList(ref item) => { + if let Some(sub_exact) = item + .metadata() + .get(ARROW_FIELD_METADATA_KEY_TILEDB_TYPE_HINT) + { + let sub_exact = + Datatype::from_str(sub_exact).unwrap(); + assert_eq!(sub_exact.size(), tdb_dt.size()); + + // item must not have been exact, else we would not have the metadata + let item_dt = + to_arrow(&tdb_dt, CellValNum::single()); + if let DatatypeToArrowResult::Inexact(item_dt) = + item_dt + { + assert_eq!(*item.data_type(), item_dt); + } else { + unreachable!( + "Expected inexact item match but found {:?}", + item_dt + ) + } + } else { + let item_dt = + to_arrow(&tdb_dt, CellValNum::single()); + if let DatatypeToArrowResult::Exact(item_dt) = + item_dt + { + assert_eq!(*item.data_type(), item_dt); + } else { + unreachable!( + "Expected exact item match but found {:?}", + item_dt + ) + } + } + } + ADT::LargeUtf8 => assert!(matches!( + tdb_dt, + Datatype::StringAscii | Datatype::StringUtf8 + )), + ADT::LargeBinary => { + assert!(matches!(tdb_dt, Datatype::Blob)) + } + adt => unreachable!( + "to_arrow({}, {:?}) = {}", + tdb_dt, cell_val_num, adt + ), + } + + let tdb_out = from_arrow(&arrow); + if let DatatypeFromArrowResult::Exact( + tdb_out, + cell_val_num_out, + ) = tdb_out + { + assert_eq!(tdb_dt, tdb_out); + assert_eq!(cell_val_num, cell_val_num_out); + } else { + unreachable!( + "Arrow datatype constructed from tiledb datatype must 
convert back") + } + } + } + } + + fn do_from_arrow(arrow_in: &arrow_schema::DataType) { + match from_arrow(arrow_in) { + DatatypeFromArrowResult::None => (), + DatatypeFromArrowResult::Exact(datatype, cvn) => { + let arrow_out = to_arrow(&datatype, cvn); + if let DatatypeToArrowResult::Exact(arrow_out) = arrow_out { + if let arrow_schema::DataType::FixedSizeList(element, 1) = + arrow_in + { + // FixedSizeList with length 1 has no way to indicate "list" + // for tiledb, so when converting back we lose the FixedSizeList + assert_eq!(*element.data_type(), arrow_out); + } else { + assert_eq!(*arrow_in, arrow_out); + } + } else { + unreachable!( + "Expected exact inversion, found {:?}", + arrow_out + ) + } + } + DatatypeFromArrowResult::Inexact(datatype, cvn) => { + let arrow_out = to_arrow(&datatype, cvn); + let arrow_out = arrow_out.into_inner(); + assert!( + is_physical_type_match(arrow_in, &arrow_out), + "{:?} => {:?}", + arrow_in, + arrow_out + ); + } + } + } + + proptest! { + #[test] + fn test_to_arrow_single(tdb_dt in any::()) { + do_to_arrow_single(tdb_dt) + } + + #[test] + fn test_to_arrow_nonvar(tdb_dt in any::()) { + do_to_arrow_nonvar(tdb_dt); + } + + #[test] + fn test_to_arrow_var(tdb_dt in any::()) { + do_to_arrow_var(tdb_dt); + } + + #[test] + fn test_from_arrow(arrow in super::strategy::prop_arrow_field()) { + do_from_arrow(arrow.data_type()); + } + } +} diff --git a/tiledb/api/src/datatype/logical.rs b/tiledb/common/src/datatype/logical.rs similarity index 91% rename from tiledb/api/src/datatype/logical.rs rename to tiledb/common/src/datatype/logical.rs index 42aabeb4..00f3baf4 100644 --- a/tiledb/api/src/datatype/logical.rs +++ b/tiledb/common/src/datatype/logical.rs @@ -156,32 +156,6 @@ macro_rules! 
datetime_type { } } -/* -declare_datetime!( - DateTimeYear, - DateTimeMonth, - DateTimeWeek, - DateTimeDay, - DateTimeHour, - DateTimeMinute, - DateTimeSecond, - DateTimeMillisecond, - DateTimeMicrosecond, - DateTimeNanosecond, - DateTimePicosecond, - DateTimeFemtosecond, - DateTimeAttosecond, - TimeHour, - TimeMinute, - TimeSecond, - TimeMillisecond, - TimeMicrosecond, - TimeNanosecond, - TimePicosecond, - TimeFemtosecond, - TimeAttosecond, -); -*/ datetime_type!( DateTimeYear, DateTimeMonth, diff --git a/tiledb/common/src/datatype/mod.rs b/tiledb/common/src/datatype/mod.rs new file mode 100644 index 00000000..bd550a98 --- /dev/null +++ b/tiledb/common/src/datatype/mod.rs @@ -0,0 +1,1063 @@ +pub mod logical; +pub mod physical; + +pub use logical::*; +pub use physical::{PhysicalType, PhysicalValue}; + +use std::fmt::{Debug, Display, Formatter, Result as FmtResult}; +use std::str::FromStr; + +use thiserror::Error; + +#[cfg(feature = "option-subset")] +use tiledb_utils::option::OptionSubset; + +#[cfg(feature = "serde")] +use serde::{Deserialize, Serialize}; + +#[derive(Clone, Debug, Eq, Error, PartialEq)] +pub enum Error { + #[error("Physical type mismatch: expected {actual_type}, found {requested_type}")] + PhysicalTypeMismatch { + requested_type: &'static str, + actual_type: &'static str, + }, + #[error("Physical type '{physical_type}' is not compatible with logical type '{logical_type}'")] + PhysicalTypeIncompatible { + physical_type: &'static str, + logical_type: Datatype, + }, + #[error( + "Logical type mismatch: expected {target_type}, found {source_type}" + )] + LogicalTypeMismatch { + source_type: Datatype, + target_type: Datatype, + }, +} + +impl Error { + pub fn physical_type_mismatch() -> Self { + Self::PhysicalTypeMismatch { + requested_type: std::any::type_name::(), + actual_type: std::any::type_name::(), + } + } + + pub fn physical_type_incompatible(logical_type: Datatype) -> Self { + Self::PhysicalTypeIncompatible { + physical_type: 
std::any::type_name::(), + logical_type, + } + } +} + +#[derive(Clone, Copy, Debug, Default, Eq, Hash, PartialEq)] +#[cfg_attr(feature = "serde", derive(Deserialize, Serialize))] +#[repr(u64)] +pub enum Datatype { + /// A 32-bit signed integer + Int32, + /// A 64-bit signed integer + Int64, + /// A 32-bit floating point value + Float32, + /// A 64-bit floating point value + Float64, + /// An 8-bit character value + Char, + /// An 8-bit signed integer + Int8, + /// An 8-bit unsigned integer + UInt8, + /// A 16-bit signed integer + Int16, + /// A 16-bit unsigned integer + UInt16, + /// A 32-bit unsigned integer + UInt32, + /// A 64-bit unsigned integer + UInt64, + /// An ASCII string + StringAscii, + /// A UTF-8 string + StringUtf8, + /// A UTF-16 string + StringUtf16, + /// A UTF-32 string + StringUtf32, + /// A UCS2 string + StringUcs2, + /// A UCS4 string + StringUcs4, + /// An arbitrary type + // Any is default to cause an error if we forget to set it on either a + // DimensionData or AttributeData instance. 
+ #[default] + Any, + /// DateTime with year resolution + DateTimeYear, + /// DateTime with month resolution + DateTimeMonth, + /// DateTime with week resolution + DateTimeWeek, + /// DateTime with day resolution + DateTimeDay, + /// DateTime with hour resolution + DateTimeHour, + /// DateTime with minute resolution + DateTimeMinute, + /// DateTime with second resolution + DateTimeSecond, + /// DateTime with millisecond resolution + DateTimeMillisecond, + /// DateTime with microsecond resolution + DateTimeMicrosecond, + /// DateTime with nanosecond resolution + DateTimeNanosecond, + /// DateTime with picosecond resolution + DateTimePicosecond, + /// DateTime with femtosecond resolution + DateTimeFemtosecond, + /// DateTime with attosecond resolution + DateTimeAttosecond, + /// Time with hour resolution + TimeHour, + /// Time with minute resolution + TimeMinute, + /// Time with second resolution + TimeSecond, + /// Time with millisecond resolution + TimeMillisecond, + /// Time with microsecond resolution + TimeMicrosecond, + /// Time with nanosecond resolution + TimeNanosecond, + /// Time with picosecond resolution + TimePicosecond, + /// Time with femtosecond resolution + TimeFemtosecond, + /// Time with attosecond resolution + TimeAttosecond, + /// Byte sequence + Blob, + /// Boolean + Boolean, + /// A Geometry in well-known binary (WKB) format + GeometryWkb, + /// A Geometry in well-known text (WKT) format + GeometryWkt, +} + +pub const DATATYPES: [Datatype; 43] = [ + Datatype::Int32, + Datatype::Int64, + Datatype::Float32, + Datatype::Float64, + Datatype::Char, + Datatype::Int8, + Datatype::UInt8, + Datatype::Int16, + Datatype::UInt16, + Datatype::UInt32, + Datatype::UInt64, + Datatype::StringAscii, + Datatype::StringUtf8, + Datatype::StringUtf16, + Datatype::StringUtf32, + Datatype::StringUcs2, + Datatype::StringUcs4, + Datatype::DateTimeYear, + Datatype::DateTimeMonth, + Datatype::DateTimeWeek, + Datatype::DateTimeDay, + Datatype::DateTimeHour, + 
Datatype::DateTimeMinute, + Datatype::DateTimeSecond, + Datatype::DateTimeMillisecond, + Datatype::DateTimeMicrosecond, + Datatype::DateTimeNanosecond, + Datatype::DateTimePicosecond, + Datatype::DateTimeFemtosecond, + Datatype::DateTimeAttosecond, + Datatype::TimeHour, + Datatype::TimeMinute, + Datatype::TimeSecond, + Datatype::TimeMillisecond, + Datatype::TimeMicrosecond, + Datatype::TimeNanosecond, + Datatype::TimePicosecond, + Datatype::TimeFemtosecond, + Datatype::TimeAttosecond, + Datatype::Blob, + Datatype::Boolean, + Datatype::GeometryWkb, + Datatype::GeometryWkt, +]; + +pub const DENSE_DIMENSION_DATATYPES: [Datatype; 30] = [ + Datatype::Int8, + Datatype::Int16, + Datatype::Int32, + Datatype::Int64, + Datatype::UInt8, + Datatype::UInt16, + Datatype::UInt32, + Datatype::UInt64, + Datatype::DateTimeYear, + Datatype::DateTimeMonth, + Datatype::DateTimeWeek, + Datatype::DateTimeDay, + Datatype::DateTimeHour, + Datatype::DateTimeMinute, + Datatype::DateTimeSecond, + Datatype::DateTimeMillisecond, + Datatype::DateTimeMicrosecond, + Datatype::DateTimeNanosecond, + Datatype::DateTimePicosecond, + Datatype::DateTimeFemtosecond, + Datatype::DateTimeAttosecond, + Datatype::TimeHour, + Datatype::TimeMinute, + Datatype::TimeSecond, + Datatype::TimeMillisecond, + Datatype::TimeMicrosecond, + Datatype::TimeNanosecond, + Datatype::TimePicosecond, + Datatype::TimeFemtosecond, + Datatype::TimeAttosecond, +]; + +pub const SPARSE_DIMENSION_DATATYPES: [Datatype; 33] = [ + Datatype::Int8, + Datatype::Int16, + Datatype::Int32, + Datatype::Int64, + Datatype::UInt8, + Datatype::UInt16, + Datatype::UInt32, + Datatype::UInt64, + Datatype::Float32, + Datatype::Float64, + Datatype::DateTimeYear, + Datatype::DateTimeMonth, + Datatype::DateTimeWeek, + Datatype::DateTimeDay, + Datatype::DateTimeHour, + Datatype::DateTimeMinute, + Datatype::DateTimeSecond, + Datatype::DateTimeMillisecond, + Datatype::DateTimeMicrosecond, + Datatype::DateTimeNanosecond, + Datatype::DateTimePicosecond, + 
Datatype::DateTimeFemtosecond, + Datatype::DateTimeAttosecond, + Datatype::TimeHour, + Datatype::TimeMinute, + Datatype::TimeSecond, + Datatype::TimeMillisecond, + Datatype::TimeMicrosecond, + Datatype::TimeNanosecond, + Datatype::TimePicosecond, + Datatype::TimeFemtosecond, + Datatype::TimeAttosecond, + Datatype::StringAscii, +]; + +impl Datatype { + pub fn size(&self) -> usize { + crate::physical_type_go!(self, DT, std::mem::size_of::
()) + } + + pub fn is_compatible_type(&self) -> bool { + use std::any::TypeId; + + let tid = TypeId::of::(); + if tid == TypeId::of::() { + matches!(*self, Datatype::Float32) + } else if tid == TypeId::of::() { + matches!(*self, Datatype::Float64) + } else if tid == TypeId::of::() { + matches!(*self, Datatype::Char | Datatype::Int8) + } else if tid == TypeId::of::() { + matches!( + *self, + Datatype::Any + | Datatype::Blob + | Datatype::Boolean + | Datatype::GeometryWkb + | Datatype::GeometryWkt + | Datatype::StringAscii + | Datatype::StringUtf8 + | Datatype::UInt8 + ) + } else if tid == TypeId::of::() { + matches!(*self, Datatype::Int16) + } else if tid == TypeId::of::() { + matches!( + *self, + Datatype::StringUtf16 | Datatype::StringUcs2 | Datatype::UInt16 + ) + } else if tid == TypeId::of::() { + matches!(*self, Datatype::Int32) + } else if tid == TypeId::of::() { + matches!( + *self, + Datatype::StringUtf32 | Datatype::StringUcs4 | Datatype::UInt32 + ) + } else if tid == TypeId::of::() { + matches!( + *self, + Datatype::Int64 + | Datatype::DateTimeYear + | Datatype::DateTimeMonth + | Datatype::DateTimeWeek + | Datatype::DateTimeDay + | Datatype::DateTimeHour + | Datatype::DateTimeMinute + | Datatype::DateTimeSecond + | Datatype::DateTimeMillisecond + | Datatype::DateTimeMicrosecond + | Datatype::DateTimeNanosecond + | Datatype::DateTimePicosecond + | Datatype::DateTimeFemtosecond + | Datatype::DateTimeAttosecond + | Datatype::TimeHour + | Datatype::TimeMinute + | Datatype::TimeSecond + | Datatype::TimeMillisecond + | Datatype::TimeMicrosecond + | Datatype::TimeNanosecond + | Datatype::TimePicosecond + | Datatype::TimeFemtosecond + | Datatype::TimeAttosecond + ) + } else if tid == TypeId::of::() { + matches!(*self, Datatype::UInt64) + } else { + false + } + } + + /// Returns whether this type is an integral type (i.e. 
integer) + // Keep in sync with sm/enums/datatype.h::datatype_is_integer + pub fn is_integral_type(&self) -> bool { + matches!( + *self, + Datatype::Boolean + | Datatype::Int8 + | Datatype::Int16 + | Datatype::Int32 + | Datatype::Int64 + | Datatype::UInt8 + | Datatype::UInt16 + | Datatype::UInt32 + | Datatype::UInt64 + ) + } + + /// Returns whether this type is a real number (i.e. floating point) + // Keep in sync with sm/enums/datatype.h::datatype_is_real + pub fn is_real_type(&self) -> bool { + matches!(*self, Datatype::Float32 | Datatype::Float64) + } + + /// Returns whether this type is a variable-length string type + // Keep in sync with sm/enums/datatype.h::datatype_is_string + pub fn is_string_type(&self) -> bool { + matches!( + *self, + Datatype::StringAscii + | Datatype::StringUtf8 + | Datatype::StringUtf16 + | Datatype::StringUtf32 + | Datatype::StringUcs2 + | Datatype::StringUcs4 + ) + } + + /// Returns whether this type is a DateTime type of any resolution + // Keep in sync with sm/enums/datatype.h::datatype_is_datetime + pub fn is_datetime_type(&self) -> bool { + matches!( + *self, + Datatype::DateTimeYear + | Datatype::DateTimeMonth + | Datatype::DateTimeWeek + | Datatype::DateTimeDay + | Datatype::DateTimeHour + | Datatype::DateTimeMinute + | Datatype::DateTimeSecond + | Datatype::DateTimeMillisecond + | Datatype::DateTimeMicrosecond + | Datatype::DateTimeNanosecond + | Datatype::DateTimePicosecond + | Datatype::DateTimeFemtosecond + | Datatype::DateTimeAttosecond + ) + } + + /// Returns whether this type is a Time type of any resolution + // Keep in sync with sm/enums/datatype.h::datatype_is_time + pub fn is_time_type(&self) -> bool { + matches!( + *self, + Datatype::TimeHour + | Datatype::TimeMinute + | Datatype::TimeSecond + | Datatype::TimeMillisecond + | Datatype::TimeMicrosecond + | Datatype::TimeNanosecond + | Datatype::TimePicosecond + | Datatype::TimeFemtosecond + | Datatype::TimeAttosecond + ) + } + + /// Returns whether this type is a byte 
+ // Keep in sync with sm/enums/datatype.h:datatype_is_byte + pub fn is_byte_type(&self) -> bool { + matches!( + *self, + Datatype::Blob | Datatype::GeometryWkb | Datatype::GeometryWkt + ) + } + + /// Returns whether this type can be used as a dimension type of a sparse array + pub fn is_allowed_dimension_type_sparse(&self) -> bool { + !matches!(self, Datatype::Boolean) + && (self.is_integral_type() + || self.is_datetime_type() + || self.is_time_type() + || matches!( + *self, + Datatype::Float32 + | Datatype::Float64 + | Datatype::StringAscii + )) + } + + /// Returns whether this type can be used as a dimension type of a dense array + pub fn is_allowed_dimension_type_dense(&self) -> bool { + !matches!(self, Datatype::Boolean) + && (self.is_integral_type() + || self.is_datetime_type() + || self.is_time_type()) + } + + pub fn same_physical_type(&self, other: &Datatype) -> bool { + crate::physical_type_go!(self, MyPhysicalType, { + crate::physical_type_go!(other, TheirPhysicalType, { + std::any::TypeId::of::() + == std::any::TypeId::of::() + }) + }) + } + + /// Returns an `Iterator` which yields each variant of `Datatype` + /// exactly once in an unspecified order. 
+ pub fn iter() -> impl Iterator { + DATATYPES.iter().copied() + } +} + +impl Display for Datatype { + fn fmt(&self, f: &mut Formatter) -> FmtResult { + Debug::fmt(self, f) + } +} + +impl FromStr for Datatype { + type Err = String; + + fn from_str(s: &str) -> Result { + // NB: we don't use [ffi::tiledb_datatype_from_str] + // so that the [tiledb_common] crate can run without + // linking to libtiledb.so + + let s = s.to_ascii_lowercase(); + match s.as_ref() { + "int8" => Ok(Datatype::Int8), + "int16" => Ok(Datatype::Int16), + "int32" => Ok(Datatype::Int32), + "int64" => Ok(Datatype::Int64), + "float32" => Ok(Datatype::Float32), + "float64" => Ok(Datatype::Float64), + "char" => Ok(Datatype::Char), + "uint8" => Ok(Datatype::UInt8), + "uint16" => Ok(Datatype::UInt16), + "uint32" => Ok(Datatype::UInt32), + "uint64" => Ok(Datatype::UInt64), + "stringascii" => Ok(Datatype::StringAscii), + "stringutf8" => Ok(Datatype::StringUtf8), + "stringutf16" => Ok(Datatype::StringUtf16), + "stringutf32" => Ok(Datatype::StringUtf32), + "stringucs2" => Ok(Datatype::StringUcs2), + "stringucs4" => Ok(Datatype::StringUcs4), + "any" => Ok(Datatype::Any), + "datetimeyear" => Ok(Datatype::DateTimeYear), + "datetimemonth" => Ok(Datatype::DateTimeMonth), + "datetimeweek" => Ok(Datatype::DateTimeWeek), + "datetimeday" => Ok(Datatype::DateTimeDay), + "datetimehour" => Ok(Datatype::DateTimeHour), + "datetimeminute" => Ok(Datatype::DateTimeMinute), + "datetimesecond" => Ok(Datatype::DateTimeSecond), + "datetimemillisecond" => Ok(Datatype::DateTimeMillisecond), + "datetimemicrosecond" => Ok(Datatype::DateTimeMicrosecond), + "datetimenanosecond" => Ok(Datatype::DateTimeNanosecond), + "datetimepicosecond" => Ok(Datatype::DateTimePicosecond), + "datetimefemtosecond" => Ok(Datatype::DateTimeFemtosecond), + "datetimeattosecond" => Ok(Datatype::DateTimeAttosecond), + "timehour" => Ok(Datatype::TimeHour), + "timeminute" => Ok(Datatype::TimeMinute), + "timesecond" => Ok(Datatype::TimeSecond), + 
"timemillisecond" => Ok(Datatype::TimeMillisecond), + "timemicrosecond" => Ok(Datatype::TimeMicrosecond), + "timenanosecond" => Ok(Datatype::TimeNanosecond), + "timepicosecond" => Ok(Datatype::TimePicosecond), + "timefemtosecond" => Ok(Datatype::TimeFemtosecond), + "timeattosecond" => Ok(Datatype::TimeAttosecond), + "blob" => Ok(Datatype::Blob), + "boolean" => Ok(Datatype::Boolean), + "geometrywkb" => Ok(Datatype::GeometryWkb), + "geometrywkt" => Ok(Datatype::GeometryWkt), + _ => Err(s), + } + } +} + +#[cfg(feature = "option-subset")] +impl OptionSubset for Datatype { + fn option_subset(&self, other: &Self) -> bool { + if let Datatype::Any = *self { + true + } else { + self == other + } + } +} + +impl From for ffi::tiledb_datatype_t { + fn from(value: Datatype) -> Self { + match value { + Datatype::Int8 => ffi::tiledb_datatype_t_TILEDB_INT8, + Datatype::Int16 => ffi::tiledb_datatype_t_TILEDB_INT16, + Datatype::Int32 => ffi::tiledb_datatype_t_TILEDB_INT32, + Datatype::Int64 => ffi::tiledb_datatype_t_TILEDB_INT64, + Datatype::Float32 => ffi::tiledb_datatype_t_TILEDB_FLOAT32, + Datatype::Float64 => ffi::tiledb_datatype_t_TILEDB_FLOAT64, + Datatype::Char => ffi::tiledb_datatype_t_TILEDB_CHAR, + Datatype::UInt8 => ffi::tiledb_datatype_t_TILEDB_UINT8, + Datatype::UInt16 => ffi::tiledb_datatype_t_TILEDB_UINT16, + Datatype::UInt32 => ffi::tiledb_datatype_t_TILEDB_UINT32, + Datatype::UInt64 => ffi::tiledb_datatype_t_TILEDB_UINT64, + Datatype::StringAscii => ffi::tiledb_datatype_t_TILEDB_STRING_ASCII, + Datatype::StringUtf8 => ffi::tiledb_datatype_t_TILEDB_STRING_UTF8, + Datatype::StringUtf16 => ffi::tiledb_datatype_t_TILEDB_STRING_UTF16, + Datatype::StringUtf32 => ffi::tiledb_datatype_t_TILEDB_STRING_UTF32, + Datatype::StringUcs2 => ffi::tiledb_datatype_t_TILEDB_STRING_UCS2, + Datatype::StringUcs4 => ffi::tiledb_datatype_t_TILEDB_STRING_UCS4, + Datatype::Any => ffi::tiledb_datatype_t_TILEDB_ANY, + Datatype::DateTimeYear => { + ffi::tiledb_datatype_t_TILEDB_DATETIME_YEAR + } 
+ Datatype::DateTimeMonth => { + ffi::tiledb_datatype_t_TILEDB_DATETIME_MONTH + } + Datatype::DateTimeWeek => { + ffi::tiledb_datatype_t_TILEDB_DATETIME_WEEK + } + Datatype::DateTimeDay => ffi::tiledb_datatype_t_TILEDB_DATETIME_DAY, + Datatype::DateTimeHour => ffi::tiledb_datatype_t_TILEDB_DATETIME_HR, + Datatype::DateTimeMinute => { + ffi::tiledb_datatype_t_TILEDB_DATETIME_MIN + } + Datatype::DateTimeSecond => { + ffi::tiledb_datatype_t_TILEDB_DATETIME_SEC + } + Datatype::DateTimeMillisecond => { + ffi::tiledb_datatype_t_TILEDB_DATETIME_MS + } + Datatype::DateTimeMicrosecond => { + ffi::tiledb_datatype_t_TILEDB_DATETIME_US + } + Datatype::DateTimeNanosecond => { + ffi::tiledb_datatype_t_TILEDB_DATETIME_NS + } + Datatype::DateTimePicosecond => { + ffi::tiledb_datatype_t_TILEDB_DATETIME_PS + } + Datatype::DateTimeFemtosecond => { + ffi::tiledb_datatype_t_TILEDB_DATETIME_FS + } + Datatype::DateTimeAttosecond => { + ffi::tiledb_datatype_t_TILEDB_DATETIME_AS + } + Datatype::TimeHour => ffi::tiledb_datatype_t_TILEDB_TIME_HR, + Datatype::TimeMinute => ffi::tiledb_datatype_t_TILEDB_TIME_MIN, + Datatype::TimeSecond => ffi::tiledb_datatype_t_TILEDB_TIME_SEC, + Datatype::TimeMillisecond => ffi::tiledb_datatype_t_TILEDB_TIME_MS, + Datatype::TimeMicrosecond => ffi::tiledb_datatype_t_TILEDB_TIME_US, + Datatype::TimeNanosecond => ffi::tiledb_datatype_t_TILEDB_TIME_NS, + Datatype::TimePicosecond => ffi::tiledb_datatype_t_TILEDB_TIME_PS, + Datatype::TimeFemtosecond => ffi::tiledb_datatype_t_TILEDB_TIME_FS, + Datatype::TimeAttosecond => ffi::tiledb_datatype_t_TILEDB_TIME_AS, + Datatype::Blob => ffi::tiledb_datatype_t_TILEDB_BLOB, + Datatype::Boolean => ffi::tiledb_datatype_t_TILEDB_BOOL, + Datatype::GeometryWkb => ffi::tiledb_datatype_t_TILEDB_GEOM_WKB, + Datatype::GeometryWkt => ffi::tiledb_datatype_t_TILEDB_GEOM_WKT, + } + } +} + +impl TryFrom for Datatype { + type Error = TryFromFFIError; + + fn try_from(value: ffi::tiledb_datatype_t) -> Result { + Ok(match value { + 
ffi::tiledb_datatype_t_TILEDB_INT8 => Datatype::Int8, + ffi::tiledb_datatype_t_TILEDB_INT16 => Datatype::Int16, + ffi::tiledb_datatype_t_TILEDB_INT32 => Datatype::Int32, + ffi::tiledb_datatype_t_TILEDB_INT64 => Datatype::Int64, + ffi::tiledb_datatype_t_TILEDB_FLOAT32 => Datatype::Float32, + ffi::tiledb_datatype_t_TILEDB_FLOAT64 => Datatype::Float64, + ffi::tiledb_datatype_t_TILEDB_CHAR => Datatype::Char, + ffi::tiledb_datatype_t_TILEDB_UINT8 => Datatype::UInt8, + ffi::tiledb_datatype_t_TILEDB_UINT16 => Datatype::UInt16, + ffi::tiledb_datatype_t_TILEDB_UINT32 => Datatype::UInt32, + ffi::tiledb_datatype_t_TILEDB_UINT64 => Datatype::UInt64, + ffi::tiledb_datatype_t_TILEDB_STRING_ASCII => Datatype::StringAscii, + ffi::tiledb_datatype_t_TILEDB_STRING_UTF8 => Datatype::StringUtf8, + ffi::tiledb_datatype_t_TILEDB_STRING_UTF16 => Datatype::StringUtf16, + ffi::tiledb_datatype_t_TILEDB_STRING_UTF32 => Datatype::StringUtf32, + ffi::tiledb_datatype_t_TILEDB_STRING_UCS2 => Datatype::StringUcs2, + ffi::tiledb_datatype_t_TILEDB_STRING_UCS4 => Datatype::StringUcs4, + ffi::tiledb_datatype_t_TILEDB_ANY => Datatype::Any, + ffi::tiledb_datatype_t_TILEDB_DATETIME_YEAR => { + Datatype::DateTimeYear + } + ffi::tiledb_datatype_t_TILEDB_DATETIME_MONTH => { + Datatype::DateTimeMonth + } + ffi::tiledb_datatype_t_TILEDB_DATETIME_WEEK => { + Datatype::DateTimeWeek + } + ffi::tiledb_datatype_t_TILEDB_DATETIME_DAY => Datatype::DateTimeDay, + ffi::tiledb_datatype_t_TILEDB_DATETIME_HR => Datatype::DateTimeHour, + ffi::tiledb_datatype_t_TILEDB_DATETIME_MIN => { + Datatype::DateTimeMinute + } + ffi::tiledb_datatype_t_TILEDB_DATETIME_SEC => { + Datatype::DateTimeSecond + } + ffi::tiledb_datatype_t_TILEDB_DATETIME_MS => { + Datatype::DateTimeMillisecond + } + ffi::tiledb_datatype_t_TILEDB_DATETIME_US => { + Datatype::DateTimeMicrosecond + } + ffi::tiledb_datatype_t_TILEDB_DATETIME_NS => { + Datatype::DateTimeNanosecond + } + ffi::tiledb_datatype_t_TILEDB_DATETIME_PS => { + Datatype::DateTimePicosecond 
+ } + ffi::tiledb_datatype_t_TILEDB_DATETIME_FS => { + Datatype::DateTimeFemtosecond + } + ffi::tiledb_datatype_t_TILEDB_DATETIME_AS => { + Datatype::DateTimeAttosecond + } + ffi::tiledb_datatype_t_TILEDB_TIME_HR => Datatype::TimeHour, + ffi::tiledb_datatype_t_TILEDB_TIME_MIN => Datatype::TimeMinute, + ffi::tiledb_datatype_t_TILEDB_TIME_SEC => Datatype::TimeSecond, + ffi::tiledb_datatype_t_TILEDB_TIME_MS => Datatype::TimeMillisecond, + ffi::tiledb_datatype_t_TILEDB_TIME_US => Datatype::TimeMicrosecond, + ffi::tiledb_datatype_t_TILEDB_TIME_NS => Datatype::TimeNanosecond, + ffi::tiledb_datatype_t_TILEDB_TIME_PS => Datatype::TimePicosecond, + ffi::tiledb_datatype_t_TILEDB_TIME_FS => Datatype::TimeFemtosecond, + ffi::tiledb_datatype_t_TILEDB_TIME_AS => Datatype::TimeAttosecond, + ffi::tiledb_datatype_t_TILEDB_BLOB => Datatype::Blob, + ffi::tiledb_datatype_t_TILEDB_BOOL => Datatype::Boolean, + ffi::tiledb_datatype_t_TILEDB_GEOM_WKB => Datatype::GeometryWkb, + ffi::tiledb_datatype_t_TILEDB_GEOM_WKT => Datatype::GeometryWkt, + _ => { + return Err(TryFromFFIError::InvalidDiscriminant(value as u64)) + } + }) + } +} + +#[derive(Clone, Debug, Error)] +pub enum TryFromFFIError { + #[error("Invalid discriminant for {}: {0}", std::any::type_name::())] + InvalidDiscriminant(u64), +} + +/// Apply a generic expression `$then` with a static type binding in the identifier `$typename` +/// for a logical type corresponding to the dynamic `$datatype`. +/// +/// This is similar to `physical_type_go!` but binds the logical type +/// instead of the physical type. +// note to developers: this is mimicking the C++ code +// template +// inline auto apply_with_type(Fn&& f, Datatype type, Args&&... args) +// +#[macro_export] +macro_rules! 
logical_type_go { + ($datatype:expr, $typename:ident, $then:expr) => {{ + type Datatype = $crate::datatype::Datatype; + match $datatype { + Datatype::Int8 => { + type $typename = $crate::datatype::logical::Int8Type; + $then + } + Datatype::Int16 => { + type $typename = $crate::datatype::logical::Int16Type; + $then + } + Datatype::Int32 => { + type $typename = $crate::datatype::logical::Int32Type; + $then + } + Datatype::Int64 => { + type $typename = $crate::datatype::logical::Int64Type; + $then + } + Datatype::UInt8 => { + type $typename = $crate::datatype::logical::UInt8Type; + $then + } + Datatype::UInt16 => { + type $typename = $crate::datatype::logical::UInt16Type; + $then + } + Datatype::UInt32 => { + type $typename = $crate::datatype::logical::UInt32Type; + $then + } + Datatype::UInt64 => { + type $typename = $crate::datatype::logical::UInt64Type; + $then + } + Datatype::Float32 => { + type $typename = $crate::datatype::logical::Float32Type; + $then + } + Datatype::Float64 => { + type $typename = $crate::datatype::logical::Float64Type; + $then + } + Datatype::Char => { + type $typename = $crate::datatype::logical::CharType; + $then + } + Datatype::StringAscii => { + type $typename = $crate::datatype::logical::StringAsciiType; + $then + } + Datatype::StringUtf8 => { + type $typename = $crate::datatype::logical::StringUtf8Type; + $then + } + Datatype::StringUtf16 => { + type $typename = $crate::datatype::logical::StringUtf16Type; + $then + } + Datatype::StringUtf32 => { + type $typename = $crate::datatype::logical::StringUtf32Type; + $then + } + Datatype::StringUcs2 => { + type $typename = $crate::datatype::logical::StringUcs2Type; + $then + } + Datatype::StringUcs4 => { + type $typename = $crate::datatype::logical::StringUcs4Type; + $then + } + Datatype::Any => { + type $typename = $crate::datatype::logical::AnyType; + $then + } + Datatype::DateTimeYear => { + type $typename = $crate::datatype::logical::DateTimeYearType; + $then + } + Datatype::DateTimeMonth 
=> { + type $typename = $crate::datatype::logical::DateTimeMonthType; + $then + } + Datatype::DateTimeWeek => { + type $typename = $crate::datatype::logical::DateTimeWeekType; + $then + } + Datatype::DateTimeDay => { + type $typename = $crate::datatype::logical::DateTimeDayType; + $then + } + Datatype::DateTimeHour => { + type $typename = $crate::datatype::logical::DateTimeHourType; + $then + } + Datatype::DateTimeMinute => { + type $typename = $crate::datatype::logical::DateTimeMinuteType; + $then + } + Datatype::DateTimeSecond => { + type $typename = $crate::datatype::logical::DateTimeSecondType; + $then + } + Datatype::DateTimeMillisecond => { + type $typename = + $crate::datatype::logical::DateTimeMillisecondType; + $then + } + Datatype::DateTimeMicrosecond => { + type $typename = + $crate::datatype::logical::DateTimeMicrosecondType; + $then + } + Datatype::DateTimeNanosecond => { + type $typename = + $crate::datatype::logical::DateTimeNanosecondType; + $then + } + Datatype::DateTimePicosecond => { + type $typename = + $crate::datatype::logical::DateTimePicosecondType; + $then + } + Datatype::DateTimeFemtosecond => { + type $typename = + $crate::datatype::logical::DateTimeFemtosecondType; + $then + } + Datatype::DateTimeAttosecond => { + type $typename = + $crate::datatype::logical::DateTimeAttosecondType; + $then + } + Datatype::TimeHour => { + type $typename = $crate::datatype::logical::TimeHourType; + $then + } + Datatype::TimeMinute => { + type $typename = $crate::datatype::logical::TimeMinuteType; + $then + } + Datatype::TimeSecond => { + type $typename = $crate::datatype::logical::TimeSecondType; + $then + } + Datatype::TimeMillisecond => { + type $typename = $crate::datatype::logical::TimeMillisecondType; + $then + } + Datatype::TimeMicrosecond => { + type $typename = $crate::datatype::logical::TimeMicrosecondType; + $then + } + Datatype::TimeNanosecond => { + type $typename = $crate::datatype::logical::TimeNanosecondType; + $then + } + 
Datatype::TimePicosecond => { + type $typename = $crate::datatype::logical::TimePicosecondType; + $then + } + Datatype::TimeFemtosecond => { + type $typename = $crate::datatype::logical::TimeFemtosecondType; + $then + } + Datatype::TimeAttosecond => { + type $typename = $crate::datatype::logical::TimeAttosecondType; + $then + } + Datatype::Blob => { + type $typename = $crate::datatype::logical::BlobType; + $then + } + Datatype::Boolean => { + type $typename = $crate::datatype::logical::BooleanType; + $then + } + Datatype::GeometryWkb => { + type $typename = $crate::datatype::logical::GeometryWkbType; + $then + } + Datatype::GeometryWkt => { + type $typename = $crate::datatype::logical::GeometryWktType; + $then + } + } + }}; +} + +/// Apply a generic expression `$then` with a static type binding in the identifier `$typename` +/// for a physical type corresponding to the dynamic `$datatype`. +/// +/// This is similar to `logical_type_go!` but binds the physical type instead of logical +/// type which is useful for calling generic functions and methods with a `PhysicalType` +/// trait bound. +/// +/// # Examples +/// +/// ``` +/// use tiledb_common::physical_type_go; +/// use tiledb_common::datatype::Datatype; +/// +/// fn physical_type_to_str(datatype: Datatype) -> String { +/// physical_type_go!(datatype, DT, std::any::type_name::
().to_owned()) +/// } +/// +/// assert_eq!("u8", physical_type_to_str(Datatype::UInt8)); +/// assert_eq!("u8", physical_type_to_str(Datatype::StringAscii)); +/// assert_eq!("u64", physical_type_to_str(Datatype::UInt64)); +/// assert_eq!("i64", physical_type_to_str(Datatype::DateTimeMillisecond)); +/// ``` +#[macro_export] +macro_rules! physical_type_go { + ($datatype:expr, $typename:ident, $then:expr) => {{ + $crate::logical_type_go!($datatype, PhysicalTypeGoLogicalType, { + type $typename = ::PhysicalType; + $then + }) + }}; +} + +#[cfg(feature = "arrow")] +pub mod arrow; + +#[cfg(any(test, feature = "proptest-strategies"))] +pub mod strategy; + +#[cfg(test)] +mod tests { + use std::collections::HashSet; + + use proptest::prelude::*; + + use super::*; + + #[test] + fn datatype_roundtrips() { + for i in 0..256 { + let maybe_dt = Datatype::try_from(i); + if let Ok(dt) = maybe_dt { + assert_eq!(i, ffi::tiledb_datatype_t::from(dt)); + } + } + } + + #[test] + fn datatype_test() { + const NUM_DATATYPES: usize = DATATYPES.len() + 1; // for Datatype::Any + for i in 0..256 { + if i < NUM_DATATYPES { + let dt = Datatype::try_from(i as u32) + .expect("Error converting value to Datatype"); + assert_ne!( + format!("{}", dt), + "".to_string() + ); + assert!(check_valid(&dt)); + } else { + assert!(Datatype::try_from(i as u32).is_err()); + } + } + } + + #[test] + fn iter() { + let mut yielded = HashSet::::new(); + for dt in Datatype::iter() { + let prev = yielded.insert(dt); + assert!(prev); + } + } + + fn check_valid(dt: &Datatype) -> bool { + let mut count = 0; + + if dt.is_compatible_type::() { + count += 1; + } + + if dt.is_compatible_type::() { + count += 1; + } + + if dt.is_compatible_type::() { + count += 1; + } + + if dt.is_compatible_type::() { + count += 1; + } + + if dt.is_compatible_type::() { + count += 1; + } + + if dt.is_compatible_type::() { + count += 1; + } + + if dt.is_compatible_type::() { + count += 1; + } + + if dt.is_compatible_type::() { + count += 1; + } 
+ + if dt.is_compatible_type::() { + count += 1; + } + + if dt.is_compatible_type::() { + count += 1; + } + + count == 1 + } + + #[cfg(feature = "option-subset")] + #[test] + fn option_subset() { + use tiledb_utils::{assert_not_option_subset, assert_option_subset}; + + assert_option_subset!(Datatype::Any, Datatype::Any); + assert_option_subset!(Datatype::Any, Datatype::UInt16); + assert_option_subset!(Datatype::Any, Datatype::UInt32); + assert_option_subset!(Datatype::UInt16, Datatype::UInt16); + assert_option_subset!(Datatype::UInt32, Datatype::UInt32); + assert_not_option_subset!(Datatype::UInt32, Datatype::Any); + assert_not_option_subset!(Datatype::UInt32, Datatype::UInt16); + assert_not_option_subset!(Datatype::UInt16, Datatype::Any); + assert_not_option_subset!(Datatype::UInt16, Datatype::UInt32); + } + + proptest! { + #[test] + fn logical_type(dt in any::()) { + logical_type_go!(dt, LT, { + let lt_constant = ::DATA_TYPE; + assert_eq!(dt, lt_constant); + + assert!(dt.is_compatible_type::<::PhysicalType>()); + }) + } + } + + #[test] + fn from_str() { + for datatype in Datatype::iter() { + let s_in = datatype.to_string(); + let s_out = Datatype::from_str(&s_in); + + assert_eq!(Ok(datatype), s_out); + } + } +} diff --git a/tiledb/api/src/datatype/physical.rs b/tiledb/common/src/datatype/physical.rs similarity index 88% rename from tiledb/api/src/datatype/physical.rs rename to tiledb/common/src/datatype/physical.rs index 6e8f6657..831faa72 100644 --- a/tiledb/api/src/datatype/physical.rs +++ b/tiledb/common/src/datatype/physical.rs @@ -2,9 +2,7 @@ use std::cmp::Ordering; use std::fmt::{Debug, Display, Formatter, Result as FmtResult}; use std::hash::{Hash, Hasher}; -use serde::{Deserialize, Serialize}; - -use crate::error::Error; +use crate::datatype::Error; use crate::private::sealed; /// Trait for comparisons based on value bits. 
@@ -206,19 +204,15 @@ pub trait PhysicalType: + Copy + Debug + Default - + for<'a> Deserialize<'a> + PartialEq + PartialOrd + Send - + Serialize + Sync + crate::private::Sealed + 'static { } -pub trait IntegralType: Eq + Ord + PhysicalType {} - macro_rules! integral_type_impls { ($($T:ty: $datatype:expr),+) => { sealed!($($T),+); @@ -243,8 +237,6 @@ macro_rules! integral_type_impls { } impl PhysicalType for $T {} - - impl IntegralType for $T {} )+ } } @@ -461,3 +453,48 @@ impl Display for PhysicalValue { } } } + +#[cfg(feature = "proptest-strategies")] +pub mod strategy { + use proptest::strategy::BoxedStrategy; + + pub enum PhysicalValueStrategy { + UInt8(BoxedStrategy), + UInt16(BoxedStrategy), + UInt32(BoxedStrategy), + UInt64(BoxedStrategy), + Int8(BoxedStrategy), + Int16(BoxedStrategy), + Int32(BoxedStrategy), + Int64(BoxedStrategy), + Float32(BoxedStrategy), + Float64(BoxedStrategy), + } + + macro_rules! field_value_strategy { + ($($variant:ident : $T:ty),+) => { + $( + impl From> for PhysicalValueStrategy { + fn from(value: BoxedStrategy<$T>) -> Self { + Self::$variant(value) + } + } + + impl TryFrom for BoxedStrategy<$T> { + type Error = (); + fn try_from(value: PhysicalValueStrategy) -> Result { + if let PhysicalValueStrategy::$variant(b) = value { + Ok(b) + } else { + Err(()) + } + } + } + )+ + } +} + + field_value_strategy!(UInt8 : u8, UInt16 : u16, UInt32 : u32, UInt64 : u64); + field_value_strategy!(Int8 : i8, Int16 : i16, Int32 : i32, Int64 : i64); + field_value_strategy!(Float32 : f32, Float64 : f64); +} diff --git a/tiledb/common/src/datatype/strategy.rs b/tiledb/common/src/datatype/strategy.rs new file mode 100644 index 00000000..11c9208b --- /dev/null +++ b/tiledb/common/src/datatype/strategy.rs @@ -0,0 +1,167 @@ +use proptest::prelude::*; + +use super::*; + +fn prop_datatype() -> impl Strategy { + prop_oneof![ + Just(Datatype::Int8), + Just(Datatype::Int16), + Just(Datatype::Int32), + Just(Datatype::Int64), + Just(Datatype::UInt8), + 
Just(Datatype::UInt16), + Just(Datatype::UInt32), + Just(Datatype::UInt64), + Just(Datatype::Float32), + Just(Datatype::Float64), + Just(Datatype::Char), + Just(Datatype::StringAscii), + Just(Datatype::StringUtf8), + Just(Datatype::StringUtf16), + Just(Datatype::StringUtf32), + Just(Datatype::StringUcs2), + Just(Datatype::StringUcs4), + Just(Datatype::Any), + Just(Datatype::DateTimeYear), + Just(Datatype::DateTimeMonth), + Just(Datatype::DateTimeWeek), + Just(Datatype::DateTimeDay), + Just(Datatype::DateTimeHour), + Just(Datatype::DateTimeMinute), + Just(Datatype::DateTimeSecond), + Just(Datatype::DateTimeMillisecond), + Just(Datatype::DateTimeMicrosecond), + Just(Datatype::DateTimeNanosecond), + Just(Datatype::DateTimePicosecond), + Just(Datatype::DateTimeFemtosecond), + Just(Datatype::DateTimeAttosecond), + Just(Datatype::TimeHour), + Just(Datatype::TimeMinute), + Just(Datatype::TimeSecond), + Just(Datatype::TimeMillisecond), + Just(Datatype::TimeMicrosecond), + Just(Datatype::TimeNanosecond), + Just(Datatype::TimePicosecond), + Just(Datatype::TimeFemtosecond), + Just(Datatype::TimeAttosecond), + Just(Datatype::Blob), + Just(Datatype::Boolean), + Just(Datatype::GeometryWkb), + Just(Datatype::GeometryWkt), + ] +} + +fn prop_datatype_for_dense_dimension() -> impl Strategy { + /* see `Datatype::is_allowed_dimension_type_dense` */ + proptest::strategy::Union::new( + DENSE_DIMENSION_DATATYPES.iter().map(|dt| Just(*dt)), + ) +} + +fn prop_datatype_for_sparse_dimension() -> impl Strategy { + /* see `Datatype::is_allowed_dimension_type_sparse` */ + proptest::strategy::Union::new( + SPARSE_DIMENSION_DATATYPES.iter().map(|dt| Just(*dt)), + ) +} + +const DELTA_FILTER_REINTERPRET_DATATYPES: [Datatype; 37] = [ + Datatype::Any, + Datatype::UInt8, + Datatype::UInt16, + Datatype::UInt32, + Datatype::UInt64, + Datatype::Int8, + Datatype::Int16, + Datatype::Int32, + Datatype::Int64, + Datatype::Float32, + Datatype::Float64, + Datatype::Boolean, + Datatype::Blob, + 
Datatype::GeometryWkb, + Datatype::GeometryWkt, + Datatype::DateTimeYear, + Datatype::DateTimeMonth, + Datatype::DateTimeWeek, + Datatype::DateTimeDay, + Datatype::DateTimeHour, + Datatype::DateTimeMinute, + Datatype::DateTimeSecond, + Datatype::DateTimeMillisecond, + Datatype::DateTimeMicrosecond, + Datatype::DateTimeNanosecond, + Datatype::DateTimePicosecond, + Datatype::DateTimeFemtosecond, + Datatype::DateTimeAttosecond, + Datatype::TimeHour, + Datatype::TimeMinute, + Datatype::TimeSecond, + Datatype::TimeMillisecond, + Datatype::TimeMicrosecond, + Datatype::TimeNanosecond, + Datatype::TimePicosecond, + Datatype::TimeFemtosecond, + Datatype::TimeAttosecond, +]; + +fn prop_datatype_for_delta_filter() -> impl Strategy { + // see core `FilterBuffer::buffers_as` + proptest::strategy::Union::new( + DELTA_FILTER_REINTERPRET_DATATYPES + .iter() + .map(|dt| Just(*dt)), + ) +} + +#[derive(Clone, Debug, Default)] +pub enum DatatypeContext { + #[default] + Any, + NotAny, + DenseDimension, + SparseDimension, + DeltaFilterReinterpretDatatype, + Fixed(Datatype), +} + +impl Arbitrary for Datatype { + type Parameters = DatatypeContext; + type Strategy = BoxedStrategy; + + fn arbitrary_with(p: Self::Parameters) -> Self::Strategy { + match p { + DatatypeContext::Any => prop_datatype().boxed(), + DatatypeContext::NotAny => prop_datatype() + .prop_filter("Datatype::Any", |dt| *dt != Datatype::Any) + .boxed(), + DatatypeContext::DenseDimension => { + prop_datatype_for_dense_dimension().boxed() + } + DatatypeContext::SparseDimension => { + prop_datatype_for_sparse_dimension().boxed() + } + DatatypeContext::DeltaFilterReinterpretDatatype => { + prop_datatype_for_delta_filter().boxed() + } + DatatypeContext::Fixed(dt) => Just(dt).boxed(), + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + proptest! 
{ + #[test] + fn dense_dimension(dt in any_with::(DatatypeContext::DenseDimension)) { + assert!(dt.is_allowed_dimension_type_dense()) + } + + #[test] + fn sparse_dimension(dt in any_with::(DatatypeContext::SparseDimension)) { + assert!(dt.is_allowed_dimension_type_sparse()) + } + } +} diff --git a/tiledb/common/src/filter/mod.rs b/tiledb/common/src/filter/mod.rs new file mode 100644 index 00000000..fdf391b0 --- /dev/null +++ b/tiledb/common/src/filter/mod.rs @@ -0,0 +1,211 @@ +mod webp; + +use thiserror::Error; + +#[cfg(feature = "option-subset")] +use tiledb_utils::option::OptionSubset; + +#[cfg(feature = "serde")] +use serde::{Deserialize, Serialize}; + +use crate::datatype::Datatype; + +pub use self::webp::*; + +#[derive(Copy, Clone, Debug, Eq, PartialEq)] +#[cfg_attr(feature = "option-subset", derive(OptionSubset))] +#[cfg_attr(feature = "serde", derive(Deserialize, Serialize))] +pub enum CompressionType { + Bzip2, + Dictionary, + Gzip, + Lz4, + Rle, + Zstd, + Delta { + reinterpret_datatype: Option, + }, + DoubleDelta { + reinterpret_datatype: Option, + }, +} + +#[derive(Copy, Clone, Debug, Eq, PartialEq)] +#[cfg_attr(feature = "option-subset", derive(OptionSubset))] +#[cfg_attr(feature = "serde", derive(Deserialize, Serialize))] +pub enum ChecksumType { + Md5, + Sha256, +} + +#[derive(Clone, Debug, PartialEq)] +#[cfg_attr(feature = "option-subset", derive(OptionSubset))] +#[cfg_attr(feature = "serde", derive(Deserialize, Serialize))] +pub struct CompressionData { + pub kind: CompressionType, + pub level: Option, +} + +impl CompressionData { + pub fn new(kind: CompressionType) -> Self { + CompressionData { kind, level: None } + } +} + +#[derive(Clone, Copy, Debug, Default, PartialEq)] +#[cfg_attr(feature = "option-subset", derive(OptionSubset))] +#[cfg_attr(feature = "serde", derive(Deserialize, Serialize))] +pub enum ScaleFloatByteWidth { + I8, + I16, + I32, + #[default] // keep in sync with tiledb/sm/filter/float_scaling_filter.h + I64, +} + +#[derive(Clone, 
Debug, Error)] +pub enum ScaleFloatByteWidthError { + #[error("Invalid byte width: {0}")] + InvalidByteWidth(usize), +} + +impl ScaleFloatByteWidth { + pub fn output_datatype(&self) -> Datatype { + match *self { + Self::I8 => Datatype::Int8, + Self::I16 => Datatype::Int16, + Self::I32 => Datatype::Int32, + Self::I64 => Datatype::Int64, + } + } +} + +impl From for std::ffi::c_ulonglong { + fn from(value: ScaleFloatByteWidth) -> Self { + let c = match value { + ScaleFloatByteWidth::I8 => std::mem::size_of::(), + ScaleFloatByteWidth::I16 => std::mem::size_of::(), + ScaleFloatByteWidth::I32 => std::mem::size_of::(), + ScaleFloatByteWidth::I64 => std::mem::size_of::(), + }; + c as Self + } +} + +impl TryFrom for ScaleFloatByteWidth { + type Error = ScaleFloatByteWidthError; + fn try_from(value: std::ffi::c_ulonglong) -> Result { + match value { + 1 => Ok(Self::I8), + 2 => Ok(Self::I16), + 4 => Ok(Self::I32), + 8 => Ok(Self::I64), + v => Err(ScaleFloatByteWidthError::InvalidByteWidth(v as usize)), + } + } +} + +#[derive(Clone, Debug, PartialEq)] +#[cfg_attr(feature = "option-subset", derive(OptionSubset))] +#[cfg_attr(feature = "serde", derive(Deserialize, Serialize))] +pub enum FilterData { + None, + BitShuffle, + ByteShuffle, + BitWidthReduction { + max_window: Option, + }, + Checksum(ChecksumType), + Compression(CompressionData), + PositiveDelta { + max_window: Option, + }, + ScaleFloat { + byte_width: Option, + factor: Option, + offset: Option, + }, + WebP { + input_format: WebPFilterInputFormat, + lossless: Option, + quality: Option, + }, + Xor, +} + +impl FilterData { + /// Returns the output datatype when this filter is applied to the input type. + /// If the filter cannot accept the requested input type, None is returned. 
+ pub fn transform_datatype(&self, input: &Datatype) -> Option { + /* + * Note to developers, this code should be kept in sync with + * tiledb/sm/filters/filter/ functions + * - `accepts_input_datatype` + * - `output_datatype` + * + * Those functions are not part of the external C API. + */ + match *self { + FilterData::None => Some(*input), + FilterData::BitShuffle => Some(*input), + FilterData::ByteShuffle => Some(*input), + FilterData::Checksum(_) => Some(*input), + FilterData::BitWidthReduction { .. } + | FilterData::PositiveDelta { .. } => { + if input.is_integral_type() + || input.is_datetime_type() + || input.is_time_type() + || input.is_byte_type() + { + Some(*input) + } else { + None + } + } + FilterData::Compression(CompressionData { kind, .. }) => match kind + { + CompressionType::Delta { + reinterpret_datatype, + } + | CompressionType::DoubleDelta { + reinterpret_datatype, + } => reinterpret_datatype.map_or(Some(*input), |dtype| { + if !dtype.is_real_type() { + Some(dtype) + } else { + None + } + }), + _ => Some(*input), + }, + FilterData::ScaleFloat { byte_width, .. } => { + let input_size = input.size(); + if input_size == std::mem::size_of::() + || input_size == std::mem::size_of::() + { + Some( + byte_width + .unwrap_or(ScaleFloatByteWidth::default()) + .output_datatype(), + ) + } else { + None + } + } + FilterData::WebP { .. 
} => { + if *input == Datatype::UInt8 { + Some(Datatype::UInt8) + } else { + None + } + } + FilterData::Xor => match input.size() { + 1 => Some(Datatype::Int8), + 2 => Some(Datatype::Int16), + 4 => Some(Datatype::Int32), + 8 => Some(Datatype::Int64), + _ => None, + }, + } + } +} diff --git a/tiledb/api/src/filter/webp.rs b/tiledb/common/src/filter/webp.rs similarity index 65% rename from tiledb/api/src/filter/webp.rs rename to tiledb/common/src/filter/webp.rs index 08f6f3e0..8dcc22af 100644 --- a/tiledb/api/src/filter/webp.rs +++ b/tiledb/common/src/filter/webp.rs @@ -1,11 +1,19 @@ +use thiserror::Error; + +#[cfg(feature = "serde")] use serde::{Deserialize, Serialize}; -use util::option::OptionSubset; +#[cfg(feature = "option-subset")] +use tiledb_utils::option::OptionSubset; -use crate::Result as TileDBResult; +#[derive(Clone, Debug, Error)] +pub enum WebPFilterError { + #[error("Invalid discriminant for {}: {0}", std::any::type_name::())] + InvalidDiscriminant(u64), +} -#[derive( - Copy, Clone, Debug, Deserialize, Eq, OptionSubset, PartialEq, Serialize, -)] +#[derive(Copy, Clone, Debug, Eq, PartialEq)] +#[cfg_attr(feature = "option-subset", derive(OptionSubset))] +#[cfg_attr(feature = "serde", derive(Deserialize, Serialize))] pub enum WebPFilterInputFormat { Rgb, Bgr, @@ -14,8 +22,17 @@ pub enum WebPFilterInputFormat { } impl WebPFilterInputFormat { - pub(crate) fn capi_enum(&self) -> ffi::tiledb_filter_webp_format_t { - let ffi_enum = match *self { + pub fn pixel_depth(&self) -> usize { + match *self { + WebPFilterInputFormat::Rgb | WebPFilterInputFormat::Bgr => 3, + WebPFilterInputFormat::Rgba | WebPFilterInputFormat::Bgra => 4, + } + } +} + +impl From for ffi::tiledb_filter_webp_format_t { + fn from(value: WebPFilterInputFormat) -> Self { + let ffi_enum = match value { WebPFilterInputFormat::Rgb => { ffi::tiledb_filter_webp_format_t_TILEDB_WEBP_RGB } @@ -31,18 +48,11 @@ impl WebPFilterInputFormat { }; ffi_enum as ffi::tiledb_filter_webp_format_t } - - pub fn 
pixel_depth(&self) -> usize { - match *self { - WebPFilterInputFormat::Rgb | WebPFilterInputFormat::Bgr => 3, - WebPFilterInputFormat::Rgba | WebPFilterInputFormat::Bgra => 4, - } - } } impl TryFrom for WebPFilterInputFormat { - type Error = crate::error::Error; - fn try_from(value: u32) -> TileDBResult { + type Error = WebPFilterError; + fn try_from(value: u32) -> Result { match value { ffi::tiledb_filter_webp_format_t_TILEDB_WEBP_RGB => { Ok(WebPFilterInputFormat::Rgb) @@ -56,10 +66,7 @@ impl TryFrom for WebPFilterInputFormat { ffi::tiledb_filter_webp_format_t_TILEDB_WEBP_BGRA => { Ok(WebPFilterInputFormat::Bgra) } - _ => Err(Self::Error::LibTileDB(format!( - "Invalid WebP filter format type: {}", - value - ))), + _ => Err(WebPFilterError::InvalidDiscriminant(value as u64)), } } } diff --git a/tiledb/api/src/key.rs b/tiledb/common/src/key.rs similarity index 100% rename from tiledb/api/src/key.rs rename to tiledb/common/src/key.rs diff --git a/tiledb/common/src/lib.rs b/tiledb/common/src/lib.rs new file mode 100644 index 00000000..1c6d1022 --- /dev/null +++ b/tiledb/common/src/lib.rs @@ -0,0 +1,33 @@ +#[cfg(feature = "option-subset")] +#[macro_use] +extern crate tiledb_proc_macro; +extern crate tiledb_sys as ffi; + +pub mod array; +pub mod datatype; +pub mod filter; +pub mod key; +pub mod metadata; +pub mod range; +pub mod vfs; + +mod private { + // The "sealed trait" pattern is a way to prevent downstream crates from implementing traits + // that you don't think they should implement. If you have `trait Foo: Sealed`, then + // downstream crates cannot `impl Foo` because they cannot `impl Sealed`. + // + // Semantic versioning is one reason you might want this. + // We currently use this as a bound for `datatype::PhysicalType` and `datatype::LogicalType` + // so that we won't accept something that we don't know about for the C API calls. + pub trait Sealed {} + + macro_rules! 
sealed { + ($($DT:ty),+) => { + $( + impl crate::private::Sealed for $DT {} + )+ + } + } + + pub(crate) use sealed; +} diff --git a/tiledb/common/src/metadata.rs b/tiledb/common/src/metadata.rs new file mode 100644 index 00000000..451f3dd9 --- /dev/null +++ b/tiledb/common/src/metadata.rs @@ -0,0 +1,289 @@ +use std::convert::From; + +use crate::datatype::Datatype; +use crate::datatype::Error as DatatypeError; + +#[cfg(feature = "option-subset")] +use tiledb_utils::option::OptionSubset; + +#[cfg(feature = "serde")] +use serde::{Deserialize, Serialize}; + +#[derive(Clone, Debug, PartialEq)] +#[cfg_attr(feature = "option-subset", derive(OptionSubset))] +#[cfg_attr(feature = "serde", derive(Deserialize, Serialize))] +pub enum Value { + UInt8Value(Vec), + UInt16Value(Vec), + UInt32Value(Vec), + UInt64Value(Vec), + Int8Value(Vec), + Int16Value(Vec), + Int32Value(Vec), + Int64Value(Vec), + Float32Value(Vec), + Float64Value(Vec), +} + +/// Applies a generic expression to the interior of a `Value`. +/// +/// # Examples +/// ``` +/// use tiledb_common::metadata::Value; +/// use tiledb_common::metadata_value_go; +/// +/// fn truncate(v: &mut Value, len: usize) { +/// metadata_value_go!(v, _DT, ref mut v_inner, v_inner.truncate(len)); +/// } +/// +/// let mut v = Value::UInt64Value(vec![0, 24, 48]); +/// truncate(&mut v, 2); +/// assert_eq!(v, Value::UInt64Value(vec![0, 24])); +/// ``` +#[macro_export] +macro_rules! 
metadata_value_go { + ($valuetype:expr, $typename:ident, $vec:pat, $then: expr) => {{ + use $crate::metadata::Value; + match $valuetype { + Value::Int8Value($vec) => { + type $typename = i8; + $then + } + Value::Int16Value($vec) => { + type $typename = i16; + $then + } + Value::Int32Value($vec) => { + type $typename = i32; + $then + } + Value::Int64Value($vec) => { + type $typename = i64; + $then + } + Value::UInt8Value($vec) => { + type $typename = u8; + $then + } + Value::UInt16Value($vec) => { + type $typename = u16; + $then + } + Value::UInt32Value($vec) => { + type $typename = u32; + $then + } + Value::UInt64Value($vec) => { + type $typename = u64; + $then + } + Value::Float32Value($vec) => { + type $typename = f32; + $then + } + Value::Float64Value($vec) => { + type $typename = f64; + $then + } + } + }}; +} +pub use metadata_value_go; + +/// Applies a generic expression to the interiors of two `Value`s with matching variants, +/// i.e. with the same physical data type. Typical usage is for comparing the insides of the two +/// `Value`s. +#[macro_export] +macro_rules! 
value_cmp { + ($lexpr:expr, $rexpr:expr, $typename:ident, $lpat:pat, $rpat:pat, $same_type:expr, $else:expr) => {{ + use $crate::metadata::Value; + match ($lexpr, $rexpr) { + (Value::Int8Value($lpat), Value::Int8Value($rpat)) => { + type $typename = i8; + $same_type + } + (Value::Int16Value($lpat), Value::Int16Value($rpat)) => { + type $typename = i16; + $same_type + } + (Value::Int32Value($lpat), Value::Int32Value($rpat)) => { + type $typename = i32; + $same_type + } + (Value::Int64Value($lpat), Value::Int64Value($rpat)) => { + type $typename = i64; + $same_type + } + (Value::UInt8Value($lpat), Value::UInt8Value($rpat)) => { + type $typename = u8; + $same_type + } + (Value::UInt16Value($lpat), Value::UInt16Value($rpat)) => { + type $typename = u16; + $same_type + } + (Value::UInt32Value($lpat), Value::UInt32Value($rpat)) => { + type $typename = u32; + $same_type + } + (Value::UInt64Value($lpat), Value::UInt64Value($rpat)) => { + type $typename = u64; + $same_type + } + (Value::Float32Value($lpat), Value::Float32Value($rpat)) => { + type $typename = f32; + $same_type + } + (Value::Float64Value($lpat), Value::Float64Value($rpat)) => { + type $typename = f64; + $same_type + } + _ => $else, + } + }}; +} + +impl Value { + pub fn len(&self) -> usize { + metadata_value_go!(self, _DT, ref v, v.len()) + } + + pub fn is_empty(&self) -> bool { + metadata_value_go!(self, _DT, ref v, v.is_empty()) + } +} + +macro_rules! 
metadata_value_impl { + ($ty:ty, $constructor:expr) => { + impl From> for Value { + fn from(vec: Vec<$ty>) -> Self { + $constructor(vec) + } + } + }; +} + +metadata_value_impl!(i8, Value::Int8Value); +metadata_value_impl!(i16, Value::Int16Value); +metadata_value_impl!(i32, Value::Int32Value); +metadata_value_impl!(i64, Value::Int64Value); +metadata_value_impl!(u8, Value::UInt8Value); +metadata_value_impl!(u16, Value::UInt16Value); +metadata_value_impl!(u32, Value::UInt32Value); +metadata_value_impl!(u64, Value::UInt64Value); +metadata_value_impl!(f32, Value::Float32Value); +metadata_value_impl!(f64, Value::Float64Value); + +#[derive(Clone, Debug, PartialEq)] +pub struct Metadata { + pub key: String, + pub datatype: Datatype, + pub value: Value, +} + +impl Metadata { + pub fn new( + key: String, + datatype: Datatype, + vec: Vec, + ) -> Result + where + Value: From>, + T: 'static, + { + if !datatype.is_compatible_type::() { + return Err(DatatypeError::physical_type_incompatible::( + datatype, + )); + } + Ok(Metadata { + key, + datatype, + value: Value::from(vec), + }) + } +} + +#[cfg(any(test, feature = "proptest-strategies"))] +pub mod strategy { + use super::*; + use proptest::collection::{vec, SizeRange}; + use proptest::prelude::*; + + use crate::datatype::strategy::DatatypeContext; + use crate::physical_type_go; + + pub struct Requirements { + key: BoxedStrategy, + datatype: BoxedStrategy, + value_length: SizeRange, + } + + impl Requirements { + const DEFAULT_VALUE_LENGTH_MIN: usize = 1; // SC-48955 + const DEFAULT_VALUE_LENGTH_MAX: usize = 64; + } + + impl Default for Requirements { + fn default() -> Self { + Requirements { + key: any::().boxed(), + datatype: any_with::(DatatypeContext::NotAny).boxed(), + value_length: (Self::DEFAULT_VALUE_LENGTH_MIN + ..=Self::DEFAULT_VALUE_LENGTH_MAX) + .into(), + } + } + } + + impl Arbitrary for Metadata { + type Parameters = Requirements; + type Strategy = BoxedStrategy; + + fn arbitrary_with(params: Self::Parameters) -> 
Self::Strategy { + params + .datatype + .prop_flat_map(move |dt| { + let value_strat = physical_type_go!(dt, DT, { + vec(any::
(), params.value_length.clone()) + .prop_map(Value::from) + .boxed() + }); + (params.key.clone(), Just(dt), value_strat) + }) + .prop_map(|(key, datatype, value)| Metadata { + key, + datatype, + value, + }) + .boxed() + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + use proptest::prelude::*; + + fn do_value_cmp(m1: Metadata, m2: Metadata) { + if m1.datatype.same_physical_type(&m2.datatype) { + value_cmp!(&m1.value, &m2.value, _DT, _, _, + (), + unreachable!("Non-matching `Value` variants for same physical type: {:?} and {:?}", + m1, m2)); + } else { + value_cmp!(&m1.value, &m2.value, _DT, _, _, + unreachable!("Matching `Value` variants for different physical type: {:?} and {:?}", + m1, m2), + ()); + } + } + + proptest! { + #[test] + fn value_cmp((m1, m2) in (any::(), any::())) { + do_value_cmp(m1, m2) + } + } +} diff --git a/tiledb/api/src/range.rs b/tiledb/common/src/range.rs similarity index 79% rename from tiledb/api/src/range.rs rename to tiledb/common/src/range.rs index c5dca513..651445e3 100644 --- a/tiledb/api/src/range.rs +++ b/tiledb/common/src/range.rs @@ -1,30 +1,73 @@ -use std::fmt::{Debug, Formatter, Result as FmtResult}; use std::hash::{Hash, Hasher}; use std::num::NonZeroU32; use std::ops::Deref; -use anyhow::anyhow; +use thiserror::Error; + +#[cfg(feature = "serde")] use serde::{Deserialize, Serialize}; -use serde_json::json; use crate::array::CellValNum; use crate::datatype::physical::{BitsEq, BitsHash, BitsOrd}; -use crate::datatype::Datatype; -use crate::error::{DatatypeErrorKind, Error}; +use crate::datatype::{Datatype, Error as DatatypeError}; use crate::physical_type_go; -use crate::Result as TileDBResult; pub type MinimumBoundingRectangle = Vec; +#[derive(Clone, Debug, Eq, Error, PartialEq)] +pub enum DimensionCompatibilityError { + #[error("Dimensions cannot have a multiple-value fixed ranges: found range of size {0}")] + MultiValueRange(usize), + #[error("{:?} is invalid for dimensions", CellValNum::Fixed(*.0))] + 
CellValNumFixed(NonZeroU32), + #[error("Dimension of type {} cannot have {:?}", Datatype::StringAscii.to_string(), CellValNum::Fixed(*.0))] + FixedStringAsciiDimension(NonZeroU32), + #[error("Dimension of type {0} cannot have variable-length range")] + VarRangeForNonStringDimension(Datatype), + #[error("Dimension of type {} cannot have a fixed-length range", Datatype::StringAscii.to_string())] + FixedRangeForStringDimension, + #[error("Dimension of type {0} cannot have {:?}", CellValNum::Var)] + CellValNumVar(Datatype), + #[error("Datatype error: {0}")] + Datatype(#[from] DatatypeError), +} + +#[derive(Clone, Debug, Error)] +pub enum RangeFromSlicesError { + #[error("Start range truncation of datatype {0}: expected multiple of {} bytes but found {1}", .0.size())] + StartTruncation(Datatype, usize), + #[error("End range truncation of datatype {0}: expected multiple of {} bytes but found {1}", .0.size())] + EndTruncation(Datatype, usize), + + #[error("Start range invalid number of values: expected {0}, found {1}")] + StartMultiValueRangeMismatch(NonZeroU32, usize), + #[error("End range invalid number of values: expected {0}, found {1}")] + EndMultiValueRangeMismatch(NonZeroU32, usize), + #[error("Invalid multi-value range: {0}")] + InvalidMultiValueRange(#[from] MultiValueRangeError), +} + +#[derive(Clone, Debug, Error)] +pub enum MultiValueRangeError { + #[error("Expected multiple value cells but found {:?}; use SingleValueRange instead", CellValNum::single())] + CellValNumSingle, + #[error("Expected fixed-length {} but found {:?}", std::any::type_name::(), CellValNum::Var)] + CellValNumVar, + #[error("Invalid start range: expected range of length {0} but found {1}")] + InvalidStartRange(NonZeroU32, usize), + #[error("Invalid end range: expected range of length {0} but found {1}")] + InvalidEndRange(NonZeroU32, usize), +} + macro_rules! 
check_datatype_inner { - ($ty:ty, $dtype:expr) => { - if !$dtype.is_compatible_type::<$ty>() { - return Err(Error::Datatype(DatatypeErrorKind::TypeMismatch { - user_type: std::any::type_name::<$ty>().to_owned(), - tiledb_type: $dtype, - })); + ($ty:ty, $dtype:expr) => {{ + let datatype = $dtype; + if !datatype.is_compatible_type::<$ty>() { + return Err(DatatypeError::physical_type_incompatible::<$ty>( + datatype, + )); } - }; + }}; } macro_rules! check_datatype { @@ -79,7 +122,8 @@ where Some((lower, upper)) } -#[derive(Clone, Debug, Deserialize, Serialize)] +#[derive(Clone, Debug)] +#[cfg_attr(feature = "serde", derive(Deserialize, Serialize))] pub enum SingleValueRange { UInt8(u8, u8), UInt16(u16, u16), @@ -97,7 +141,7 @@ impl SingleValueRange { /// Returns the number of cells spanned by this range if it is a /// range over a discrete domain. /// ``` - /// use tiledb::range::SingleValueRange; + /// use tiledb_common::range::SingleValueRange; /// assert_eq!(Some(100), SingleValueRange::Int64(1, 100).num_cells()); /// assert_eq!(None, SingleValueRange::Float64(1.0, 100.0).num_cells()); /// ``` @@ -128,7 +172,10 @@ impl SingleValueRange { ) } - pub fn check_datatype(&self, datatype: Datatype) -> TileDBResult<()> { + pub fn check_datatype( + &self, + datatype: Datatype, + ) -> Result<(), DatatypeError> { check_datatype!(self, datatype); Ok(()) } @@ -511,7 +558,8 @@ impl TryFrom for std::ops::RangeInclusive { } } -#[derive(Clone, Debug, Deserialize, Serialize)] +#[derive(Clone, Debug)] +#[cfg_attr(feature = "serde", derive(Deserialize, Serialize))] pub enum MultiValueRange { UInt8(Box<[u8]>, Box<[u8]>), UInt16(Box<[u16]>, Box<[u16]>), @@ -526,7 +574,10 @@ pub enum MultiValueRange { } impl MultiValueRange { - pub fn check_datatype(&self, datatype: Datatype) -> TileDBResult<()> { + pub fn check_datatype( + &self, + datatype: Datatype, + ) -> Result<(), DatatypeError> { check_datatype!(self, datatype); Ok(()) } @@ -542,7 +593,8 @@ impl MultiValueRange { /// If the lower 
and upper bounds differ only in the last value, /// then the result is the discrete difference between the last values. /// ``` - /// use tiledb::{array::CellValNum, range::MultiValueRange}; + /// use tiledb_common::array::CellValNum; + /// use tiledb_common::range::MultiValueRange; /// /// let cvn = CellValNum::try_from(2).unwrap(); /// assert_eq!(Some(100), @@ -557,7 +609,7 @@ impl MultiValueRange { /// then all possible values of the trailing values represent unique /// cells in the range. /// ``` - /// use tiledb::range::MultiValueRange; + /// use tiledb_common::range::MultiValueRange; /// let num_i32s = ((i32::MAX as i128 - i32::MIN as i128) + 1) as u128; /// let num_i64s = ((i64::MAX as i128 - i64::MIN as i128) + 1) as u128; /// assert_eq!(Some(num_i32s + 1), @@ -701,35 +753,23 @@ macro_rules! multi_value_range_try_from { ($($V:ident : $U:ty),+) => { $( impl TryFrom<(CellValNum, Box<[$U]>, Box<[$U]>)> for MultiValueRange { - type Error = crate::error::Error; + type Error = MultiValueRangeError; fn try_from(value: (CellValNum, Box<[$U]>, Box<[$U]>)) -> - TileDBResult { + Result { let cell_val_num = match value.0 { CellValNum::Fixed(cvn) if u32::from(cvn) == 1u32 => { - return Err(Error::InvalidArgument(anyhow!( - "MultiValueRange does not support CellValNum::Fixed(1)" - ))); + return Err(MultiValueRangeError::CellValNumSingle) } - CellValNum::Fixed(cvn) => cvn.get(), + CellValNum::Fixed(cvn) => cvn, CellValNum::Var => { - return Err(Error::InvalidArgument(anyhow!( - "MultiValueRange does not support CellValNum::Var" - ))); + return Err(MultiValueRangeError::CellValNumVar) } }; - if value.1.len() as u32 != cell_val_num { - return Err(Error::InvalidArgument(anyhow!( - "Invalid range start length. 
Found {}, not {}", - value.1.len(), - cell_val_num - ))) + if value.1.len() as u32 != cell_val_num.get() { + return Err(MultiValueRangeError::InvalidStartRange(cell_val_num, value.1.len())) } - if value.2.len() as u32 != cell_val_num { - return Err(Error::InvalidArgument(anyhow!( - "Invalid range end length. Found {}, not {}", - value.2.len(), - cell_val_num - ))) + if value.2.len() as u32 != cell_val_num.get() { + return Err(MultiValueRangeError::InvalidEndRange(cell_val_num, value.2.len())) } Ok(MultiValueRange::$V(value.1, value.2)) } @@ -737,7 +777,7 @@ macro_rules! multi_value_range_try_from { impl TryFrom<(CellValNum, Vec<$U>, Vec<$U>)> for MultiValueRange { type Error = , Box<[$U]>)>>::Error; - fn try_from(value: (CellValNum, Vec<$U>, Vec<$U>)) -> TileDBResult { + fn try_from(value: (CellValNum, Vec<$U>, Vec<$U>)) -> Result { let (cell_val_num, lb, ub) = value; Self::try_from((cell_val_num, lb.into_boxed_slice(), ub.into_boxed_slice())) } @@ -957,7 +997,8 @@ macro_rules! multi_value_range_cmp { }}; } -#[derive(Clone, Debug, Deserialize, Serialize)] +#[derive(Clone, Debug)] +#[cfg_attr(feature = "serde", derive(Deserialize, Serialize))] pub enum VarValueRange { UInt8(Box<[u8]>, Box<[u8]>), UInt16(Box<[u16]>, Box<[u16]>), @@ -977,7 +1018,10 @@ impl VarValueRange { CellValNum::Var } - pub fn check_datatype(&self, datatype: Datatype) -> TileDBResult<()> { + pub fn check_datatype( + &self, + datatype: Datatype, + ) -> Result<(), DatatypeError> { check_datatype!(self, datatype); Ok(()) } @@ -1290,7 +1334,8 @@ macro_rules! 
var_value_range_cmp { }}; } -#[derive(Clone, Deserialize, Serialize, Eq, Hash, PartialEq)] +#[derive(Clone, Debug, Eq, Hash, PartialEq)] +#[cfg_attr(feature = "serde", derive(Deserialize, Serialize))] pub enum Range { Single(SingleValueRange), Multi(MultiValueRange), @@ -1325,13 +1370,13 @@ impl Range { &self, datatype: Datatype, cell_val_num: CellValNum, - ) -> TileDBResult<()> { + ) -> Result<(), DimensionCompatibilityError> { match self { Self::Single(svr) => svr.check_datatype(datatype)?, - Self::Multi(_) => { - return Err(Error::InvalidArgument(anyhow!( - "Dimensions can not have a fixed cell val num > 1" - ))); + Self::Multi(mvr) => { + return Err(DimensionCompatibilityError::MultiValueRange( + mvr.num_values(), + )) } Self::Var(vvr) => vvr.check_datatype(datatype)?, } @@ -1339,34 +1384,52 @@ impl Range { match cell_val_num { CellValNum::Fixed(cvn) => { if cvn.get() > 1 { - return Err(Error::InvalidArgument(anyhow!( - "Invalid cell val number: {}", - cvn.get() - ))); + return Err(DimensionCompatibilityError::CellValNumFixed( + cvn, + )); } if datatype == Datatype::StringAscii { - return Err(Error::InvalidArgument(anyhow!( - "StringAscii dimensions must be var sized." - ))); + return Err( + DimensionCompatibilityError::FixedStringAsciiDimension( + cvn, + ), + ); } if !matches!(self, Self::Single(_)) { - return Err(Error::InvalidArgument(anyhow!( - "Non-string dimensions must have a cell val num of 1." 
- ))); + return Err(DimensionCompatibilityError::VarRangeForNonStringDimension(datatype)); } } CellValNum::Var => { if datatype != Datatype::StringAscii { - return Err(Error::InvalidArgument(anyhow!( - "Dimensions of type {} must have a cell val num of 1", - datatype - ))); - } - if !matches!(self, Self::Var(VarValueRange::UInt8(_, _))) { - return Err(Error::InvalidArgument(anyhow!( - "String dimensions must use VarValueRange::UInt8" - ))); + return Err(DimensionCompatibilityError::CellValNumVar( + datatype, + )); } + match self { + Range::Single(SingleValueRange::UInt8(_, _)) => + Err(DimensionCompatibilityError::FixedRangeForStringDimension), + Range::Multi(MultiValueRange::UInt8(_, _)) => + Err(DimensionCompatibilityError::FixedRangeForStringDimension), + Range::Var(VarValueRange::UInt8(_, _)) => Ok(()), + Range::Single(s) => single_value_range_go!(s, DT, _, _, + Err(DimensionCompatibilityError::Datatype( + DatatypeError::physical_type_incompatible::
(datatype)))), + Range::Multi(m) => { + // NB: this is actually unreachable but this is what it would be if it were + multi_value_range_go!(m, DT, _, _, + Err(DimensionCompatibilityError::Datatype( + DatatypeError::physical_type_incompatible::
(datatype)))) + }, + Range::Var(v) => var_value_range_go!(v, DT, _, _, + Err( + DimensionCompatibilityError::Datatype( + DatatypeError::physical_type_incompatible::
( + datatype, + ), + ), + ) + ), + }? } } @@ -1405,12 +1468,6 @@ impl Range { } } -impl Debug for Range { - fn fmt(&self, f: &mut Formatter) -> FmtResult { - write!(f, "{}", json!(self)) - } -} - macro_rules! range_from_impl { ($($V:ident : $U:ty),+) => { $( @@ -1421,8 +1478,8 @@ macro_rules! range_from_impl { } impl TryFrom<(CellValNum, Box<[$U]>, Box<[$U]>)> for Range { - type Error = crate::error::Error; - fn try_from(value: (CellValNum, Box<[$U]>, Box<[$U]>)) -> TileDBResult { + type Error = , Box<[$U]>)>>::Error; + fn try_from(value: (CellValNum, Box<[$U]>, Box<[$U]>)) -> Result { Ok(Range::Multi(MultiValueRange::try_from(value)?)) } } @@ -1470,7 +1527,8 @@ impl From for Range { } } -#[derive(Clone, Deserialize, PartialEq, Serialize)] +#[derive(Clone, Debug, PartialEq)] +#[cfg_attr(feature = "serde", derive(Deserialize, Serialize))] pub struct TypedRange { pub datatype: Datatype, pub range: Range, @@ -1490,37 +1548,53 @@ impl TypedRange { cell_val_num: CellValNum, start: &[u8], end: &[u8], - ) -> TileDBResult { + ) -> Result { match cell_val_num { CellValNum::Var => { - if start.len() as u64 % datatype.size() != 0 { - return Err(Error::InvalidArgument(anyhow!( - "Invalid start length not a multiple of {:?}", - datatype.size() - ))); + if start.len() % datatype.size() != 0 { + return Err(RangeFromSlicesError::StartTruncation( + datatype, + start.len(), + )); } - if end.len() as u64 % datatype.size() != 0 { - return Err(Error::InvalidArgument(anyhow!( - "Invalid end length not a multiple of {:?}", - datatype.size() - ))); + if end.len() % datatype.size() != 0 { + return Err(RangeFromSlicesError::EndTruncation( + datatype, + start.len(), + )); } } CellValNum::Fixed(cvn) => { - let expected_len = datatype.size() * cvn.get() as u64; - if start.len() as u64 != expected_len { - return Err(Error::InvalidArgument(anyhow!( - "Invalid start length is {}, not {}", + if start.len() % datatype.size() != 0 { + return Err(RangeFromSlicesError::StartTruncation( + datatype, 
start.len(), - expected_len - ))); - } - if end.len() as u64 != expected_len { - return Err(Error::InvalidArgument(anyhow!( - "Invalid end length is {}, not {}", + )); + } else if end.len() % datatype.size() != 0 { + return Err(RangeFromSlicesError::EndTruncation( + datatype, start.len(), - expected_len - ))); + )); + } + + let num_elements_start = start.len() / datatype.size(); + if num_elements_start != cvn.get() as usize { + return Err( + RangeFromSlicesError::StartMultiValueRangeMismatch( + cvn, + num_elements_start, + ), + ); + } + + let num_elements_end = end.len() / datatype.size(); + if num_elements_end != cvn.get() as usize { + return Err( + RangeFromSlicesError::EndMultiValueRangeMismatch( + cvn, + num_elements_end, + ), + ); } } } @@ -1529,14 +1603,14 @@ impl TypedRange { let start_slice = unsafe { std::slice::from_raw_parts( start.as_ptr() as *const DT, - start.len() / datatype.size() as usize, + start.len() / datatype.size(), ) }; let start = start_slice.to_vec().into_boxed_slice(); let end_slice = unsafe { std::slice::from_raw_parts( end.as_ptr() as *const DT, - end.len() / datatype.size() as usize, + end.len() / datatype.size(), ) }; let end = end_slice.to_vec().into_boxed_slice(); @@ -1561,12 +1635,6 @@ impl TypedRange { } } -impl Debug for TypedRange { - fn fmt(&self, f: &mut Formatter) -> FmtResult { - write!(f, "{}", json!(self)) - } -} - #[derive(Clone, Debug, PartialEq)] pub struct NonEmptyDomain(Vec); @@ -1690,7 +1758,7 @@ pub mod strategy { .map(|dt| Just(dt).boxed()) .unwrap_or(any::().boxed()); let strat_nz = params.1.map(|nz| Just(nz).boxed()).unwrap_or( - (1..1024u32) + (2..1024u32) .prop_map(|nz| NonZeroU32::new(nz).unwrap()) .boxed(), ); @@ -1775,21 +1843,24 @@ pub mod strategy { #[cfg(test)] mod tests { use std::cmp::Ordering; + use std::fmt::Debug; - use super::*; - use crate::Result as TileDBResult; - use proptest::collection::vec; use proptest::prelude::*; - fn test_clone(range: &Range) { - let other = range.clone(); - 
assert_eq!(*range, other); + use super::*; + + fn test_clone(value: &T) + where + T: Clone + Debug + PartialEq, + { + let other = value.clone(); + assert_eq!(*value, other); } fn test_dimension_compatibility( range: &Range, datatype: Datatype, - ) -> TileDBResult<()> { + ) -> anyhow::Result<()> { match range { Range::Single(srange) => { if !matches!(datatype, Datatype::StringAscii) { @@ -1836,6 +1907,7 @@ mod tests { Ok(()) } + #[cfg(feature = "serde")] fn test_serialization_roundtrip(range: &Range) { let data = serde_json::to_string(range).unwrap(); let other: Range = serde_json::from_str(&data).unwrap(); @@ -1856,139 +1928,201 @@ mod tests { assert_eq!(*range, range2.range); } - // physical_type_go! seems to be fairly heavy for using with llvm-cov so I've - // minimized the number of usages in these tests by adding test helpers - // that are called from as few physical_type_go macros as possible. - #[test] - fn test_single_value_range() { - for datatype in Datatype::iter() { - physical_type_go!(datatype, DT, { - proptest!(ProptestConfig::with_cases(8), - |(start in any::
(), end in any::
())| { - - let range = Range::from(&[start, end]); - test_clone(&range); - test_dimension_compatibility(&range, datatype)?; - test_serialization_roundtrip(&range); - - let start_slice = start.to_le_bytes(); - let end_slice = end.to_le_bytes(); - test_from_slices( - &range, - datatype, - CellValNum::try_from(1)?, - &start_slice[..], - &end_slice[..] - ); - }); - }); + proptest! { + #[test] + fn single_value_range((datatype, range) in any::().prop_flat_map(|dt| + ( + Just(dt), + any_with::(Some(dt)) + ) + )) + { + do_single_value_range(datatype, range).unwrap() + } + + #[test] + fn multi_value_range((datatype, range) in any::().prop_flat_map(|dt| + ( + Just(dt), + any_with::((Some(dt), None)) + ) + )) + { + do_multi_value_range(datatype, range).unwrap() + } + + #[test] + fn var_value_range((datatype, range) in any::().prop_flat_map(|dt| + ( + Just(dt), + any_with::(Some(dt)) + ) + )) + { + do_var_value_range(datatype, range).unwrap() } } - #[test] - fn test_multi_value_range() { - for datatype in Datatype::iter() { - physical_type_go!(datatype, DT, { - proptest!(ProptestConfig::with_cases(8), - |(data in vec(any::
(), 2..=32))| { - let len = data.len() as u32; - let cell_val_num = CellValNum::try_from(len)?; - let start = data.clone().into_boxed_slice(); - let end = start.clone(); - - let range = Range::try_from( - (cell_val_num, start.clone(), end.clone()))?; - test_clone(&range); - test_dimension_compatibility(&range, datatype)?; - test_serialization_roundtrip(&range); - - let nbytes = (len as u64 * datatype.size()) as usize; - let start = data.clone().into_boxed_slice(); - let end = data.clone().into_boxed_slice(); - - let start_slice = unsafe { - std::slice::from_raw_parts( - start.as_ptr() as *mut u8 as *const u8, - nbytes, - ) - }; + fn do_single_value_range( + datatype: Datatype, + range: SingleValueRange, + ) -> anyhow::Result<()> { + test_clone(&range); - let end_slice = unsafe { - std::slice::from_raw_parts( - end.as_ptr() as *mut u8 as *const u8, - nbytes, - ) - }; + let rr = Range::Single(range.clone()); + test_dimension_compatibility(&rr, datatype)?; - test_from_slices( - &range, - datatype, - CellValNum::try_from(len)?, - start_slice, - end_slice - ); + #[cfg(feature = "serde")] + test_serialization_roundtrip(&rr); - // Check TryFrom failures - assert!(Range::try_from( - (CellValNum::try_from(1)?, start.clone(), end.clone())).is_err()); - assert!(Range::try_from( - (CellValNum::Var, start.clone(), end.clone())).is_err()); - - let start = data.clone().into_boxed_slice(); - let mut end = data.clone(); - end.push(data[0]); - let end = end.into_boxed_slice(); - assert!(Range::try_from((cell_val_num, start, end)).is_err()); - - let mut start = data.clone(); - start.push(data[0]); - let start = start.into_boxed_slice(); - let end = data.clone().into_boxed_slice(); - assert!(Range::try_from((cell_val_num, start, end)).is_err()); - }); + let (start_slice, end_slice) = + single_value_range_go!(range, _DT, ref start, ref end, { + (start.to_le_bytes().to_vec(), end.to_le_bytes().to_vec()) }); - } + test_from_slices( + &rr, + datatype, + CellValNum::try_from(1)?, + 
&start_slice[..], + &end_slice[..], + ); + Ok(()) } - #[test] - fn test_var_value_range() { - for datatype in Datatype::iter() { - physical_type_go!(datatype, DT, { - proptest!(ProptestConfig::with_cases(8), - |(start in vec(any::
(), 0..=32), end in vec(any::
(), 0..=32))| { - let start = start.into_boxed_slice(); - let end = end.into_boxed_slice(); - - let range = Range::from((start.clone(), end.clone())); - test_clone(&range); - test_dimension_compatibility(&range, datatype)?; - test_serialization_roundtrip(&range); - - // Test from slices - let start_slice = unsafe { - std::slice::from_raw_parts( - start.as_ptr() as *mut u8 as *const u8, - std::mem::size_of_val(&*start), - ) - }; + fn do_multi_value_range( + datatype: Datatype, + range: MultiValueRange, + ) -> anyhow::Result<()> { + test_clone(&range); - let end_slice = unsafe { - std::slice::from_raw_parts( - end.as_ptr() as *mut u8 as *const u8, - std::mem::size_of_val(&*end), - ) - }; + let rr = Range::Multi(range.clone()); + test_dimension_compatibility(&rr, datatype)?; - test_from_slices( - &range, - datatype, - CellValNum::Var, - start_slice, - end_slice - ); - }); + #[cfg(feature = "serde")] + test_serialization_roundtrip(&rr); + + let CellValNum::Fixed(cell_val_num) = range.cell_val_num() else { + unreachable!() + }; + + let (start_slice, end_slice) = + multi_value_range_go!(range, _DT, ref start, ref end, { + assert_eq!(start.len(), end.len()); + assert_eq!(cell_val_num.get() as usize, start.len()); + + let nbytes = std::mem::size_of_val(start.as_ref()); + + let start_slice = unsafe { + std::slice::from_raw_parts( + start.as_ptr() as *mut u8 as *const u8, + nbytes, + ) + }; + let end_slice = unsafe { + std::slice::from_raw_parts( + end.as_ptr() as *mut u8 as *const u8, + nbytes, + ) + }; + (start_slice, end_slice) }); - } + + test_from_slices( + &rr, + datatype, + CellValNum::Fixed(cell_val_num), + start_slice, + end_slice, + ); + + // Check TryFrom failures + multi_value_range_go!(range, _DT, ref start, ref end, { + assert!(Range::try_from(( + CellValNum::try_from(1)?, + start.clone(), + end.clone() + )) + .is_err()); + assert!(Range::try_from(( + CellValNum::Var, + start.clone(), + end.clone() + )) + .is_err()); + + { + let start = start.clone(); + let mut 
end = end.clone().into_vec(); + end.push(end[0]); + let end = end.into_boxed_slice(); + assert!(Range::try_from(( + CellValNum::Fixed(cell_val_num), + start, + end + )) + .is_err()); + } + + { + let mut start = start.clone().into_vec(); + start.push(start[0]); + let start = start.into_boxed_slice(); + let end = end.clone(); + assert!(Range::try_from(( + CellValNum::Fixed(cell_val_num), + start, + end + )) + .is_err()); + } + }); + Ok(()) + } + + fn do_var_value_range( + datatype: Datatype, + range: VarValueRange, + ) -> anyhow::Result<()> { + test_clone(&range); + + let rr = Range::Var(range.clone()); + test_dimension_compatibility(&rr, datatype)?; + + #[cfg(feature = "serde")] + test_serialization_roundtrip(&rr); + + let (start_slice, end_slice) = var_value_range_go!( + range, + DT, + ref start, + ref end, + #[allow(clippy::unnecessary_cast)] + { + let to_byte_slice = |s: &[DT]| unsafe { + std::slice::from_raw_parts( + if s.is_empty() { + std::ptr::NonNull::
::dangling().as_ptr() + as *mut u8 + } else { + s.as_ptr() as *mut u8 + } as *const u8, + std::mem::size_of_val(s), + ) + }; + let start_slice = to_byte_slice(start); + let end_slice = to_byte_slice(end); + (start_slice, end_slice) + } + ); + + test_from_slices( + &rr, + datatype, + CellValNum::Var, + start_slice, + end_slice, + ); + Ok(()) } #[test] @@ -2016,15 +2150,94 @@ mod tests { ) .is_err()); - let range = Range::from(&[0u8, 1u8]); - assert!(range + let _ = format!("{:?}", range); + } + + #[test] + fn dimension_compatibility_string_ascii_var() { + // single + assert_eq!( + Err(DimensionCompatibilityError::Datatype( + DatatypeError::PhysicalTypeIncompatible { + physical_type: "u16", + logical_type: Datatype::StringAscii + } + )), + Range::from(&[0u16, 1u16]).check_dimension_compatibility( + Datatype::StringAscii, + CellValNum::Var + ) + ); + assert_eq!( + Err(DimensionCompatibilityError::FixedRangeForStringDimension), + Range::from(&[0u8, 1u8]).check_dimension_compatibility( + Datatype::StringAscii, + CellValNum::Var + ) + ); + + // multi + assert_eq!( + Err(DimensionCompatibilityError::MultiValueRange(2)), + Range::Multi( + MultiValueRange::try_from(( + CellValNum::try_from(2).unwrap(), + vec![1u16, 10u16].into_boxed_slice(), + vec![10u16, 1u16].into_boxed_slice() + )) + .unwrap() + ) .check_dimension_compatibility( Datatype::StringAscii, CellValNum::Var ) - .is_err()); + ); + assert_eq!( + Err(DimensionCompatibilityError::MultiValueRange(2)), + Range::Multi( + MultiValueRange::try_from(( + CellValNum::try_from(2).unwrap(), + vec![1u8, 10u8].into_boxed_slice(), + vec![10u8, 1u8].into_boxed_slice() + )) + .unwrap() + ) + .check_dimension_compatibility( + Datatype::StringAscii, + CellValNum::Var + ) + ); - let _ = format!("{:?}", range); + // var but not u8 + assert_eq!( + Err(DimensionCompatibilityError::Datatype( + DatatypeError::PhysicalTypeIncompatible { + physical_type: "u16", + logical_type: Datatype::StringAscii + } + )), + 
Range::Var(VarValueRange::from(( + vec![1u16, 10u16].into_boxed_slice(), + vec![10u16, 1u16].into_boxed_slice() + ))) + .check_dimension_compatibility( + Datatype::StringAscii, + CellValNum::Var + ) + ); + + // var u8 + assert_eq!( + Ok(()), + Range::Var(VarValueRange::from(( + vec![1u8, 10u8].into_boxed_slice(), + vec![10u8, 1u8].into_boxed_slice() + ))) + .check_dimension_compatibility( + Datatype::StringAscii, + CellValNum::Var + ) + ); } #[test] diff --git a/tiledb/common/src/vfs.rs b/tiledb/common/src/vfs.rs new file mode 100644 index 00000000..b783f57c --- /dev/null +++ b/tiledb/common/src/vfs.rs @@ -0,0 +1,40 @@ +use thiserror::Error; + +#[cfg(feature = "serde")] +use serde::{Deserialize, Serialize}; + +#[derive(Clone, Debug, Eq, Error, PartialEq)] +pub enum VFSModeError { + #[error("Invalid discriminant for {}: {0}", std::any::type_name::())] + InvalidDiscriminant(u64), +} + +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +#[cfg_attr(feature = "serde", derive(Deserialize, Serialize))] +pub enum VFSMode { + Read, + Write, + Append, +} + +impl From for ffi::tiledb_vfs_mode_t { + fn from(value: VFSMode) -> Self { + match value { + VFSMode::Read => ffi::tiledb_vfs_mode_t_TILEDB_VFS_READ, + VFSMode::Write => ffi::tiledb_vfs_mode_t_TILEDB_VFS_WRITE, + VFSMode::Append => ffi::tiledb_vfs_mode_t_TILEDB_VFS_APPEND, + } + } +} + +impl TryFrom for VFSMode { + type Error = VFSModeError; + fn try_from(value: ffi::tiledb_vfs_mode_t) -> Result { + match value { + ffi::tiledb_vfs_mode_t_TILEDB_VFS_READ => Ok(VFSMode::Read), + ffi::tiledb_vfs_mode_t_TILEDB_VFS_WRITE => Ok(VFSMode::Write), + ffi::tiledb_vfs_mode_t_TILEDB_VFS_APPEND => Ok(VFSMode::Append), + _ => Err(VFSModeError::InvalidDiscriminant(value as u64)), + } + } +} diff --git a/tiledb/pod/Cargo.toml b/tiledb/pod/Cargo.toml new file mode 100644 index 00000000..eeee2957 --- /dev/null +++ b/tiledb/pod/Cargo.toml @@ -0,0 +1,33 @@ +[package] +name = "tiledb-pod" +edition.workspace = true +rust-version.workspace = true 
+version.workspace = true + +[dependencies] +itertools = { workspace = true } +num-traits = { workspace = true, optional = true } +proptest = { workspace = true, optional = true } +serde = { workspace = true, optional = true } +serde_json = { workspace = true, optional = true } +strategy-ext = { workspace = true, optional = true } +thiserror = { workspace = true } +tiledb-common = { workspace = true } +tiledb-proc-macro = { workspace = true, optional = true } +tiledb-proptest-config = { workspace = true, optional = true } +tiledb-utils = { workspace = true, optional = true } +tiledb-sys = { workspace = true } + +[dev-dependencies] +num-traits = { workspace = true } +proptest = { workspace = true } +tiledb-common = { workspace = true, features = ["proptest-strategies"] } +tiledb-proptest-config = { workspace = true } +strategy-ext = { workspace = true } +tiledb-utils = { workspace = true } + +[features] +default = [] +option-subset = ["dep:tiledb-proc-macro", "dep:tiledb-utils", "tiledb-common/option-subset"] +proptest-strategies = ["dep:num-traits", "dep:proptest", "dep:strategy-ext", "dep:tiledb-proptest-config", "dep:tiledb-utils", "tiledb-common/proptest-strategies"] +serde = ["dep:serde", "dep:serde_json", "tiledb-common/serde"] diff --git a/tiledb/pod/src/array/attribute/mod.rs b/tiledb/pod/src/array/attribute/mod.rs new file mode 100644 index 00000000..01f13f81 --- /dev/null +++ b/tiledb/pod/src/array/attribute/mod.rs @@ -0,0 +1,34 @@ +#[cfg(feature = "option-subset")] +use tiledb_utils::option::OptionSubset; + +#[cfg(feature = "serde")] +use serde::{Deserialize, Serialize}; + +use tiledb_common::array::CellValNum; +use tiledb_common::datatype::Datatype; +use tiledb_common::filter::FilterData; +use tiledb_common::metadata::Value as MetadataValue; + +#[derive(Clone, Default, Debug, PartialEq)] +#[cfg_attr(feature = "option-subset", derive(OptionSubset))] +#[cfg_attr(feature = "serde", derive(Deserialize, Serialize))] +pub struct AttributeData { + pub name: 
String, + pub datatype: Datatype, + pub nullability: Option, + pub cell_val_num: Option, + pub fill: Option, + pub filters: Vec, +} + +/// Encapsulation of data needed to construct an Attribute's fill value +#[derive(Clone, Debug, PartialEq)] +#[cfg_attr(feature = "option-subset", derive(OptionSubset))] +#[cfg_attr(feature = "serde", derive(Deserialize, Serialize))] +pub struct FillData { + pub data: MetadataValue, + pub nullability: Option, +} + +#[cfg(any(test, feature = "proptest-strategies"))] +pub mod strategy; diff --git a/tiledb/api/src/array/attribute/strategy.rs b/tiledb/pod/src/array/attribute/strategy.rs similarity index 73% rename from tiledb/api/src/array/attribute/strategy.rs rename to tiledb/pod/src/array/attribute/strategy.rs index 5b082fb7..dfb46b23 100644 --- a/tiledb/api/src/array/attribute/strategy.rs +++ b/tiledb/pod/src/array/attribute/strategy.rs @@ -2,16 +2,59 @@ use std::rc::Rc; use proptest::prelude::*; use proptest::strategy::ValueTree; -use tiledb_test_utils::strategy::StrategyExt; +use strategy_ext::StrategyExt; +use tiledb_common::array::{ArrayType, CellValNum}; +use tiledb_common::datatype::physical::strategy::PhysicalValueStrategy; +use tiledb_common::datatype::Datatype; +use tiledb_common::filter::FilterData; +use tiledb_common::physical_type_go; -use crate::array::{ - attribute::FillData, ArrayType, AttributeData, CellValNum, DomainData, -}; -use crate::filter::list::FilterListData; +use crate::array::attribute::{AttributeData, FillData}; +use crate::array::domain::DomainData; use crate::filter::strategy::{ - FilterPipelineValueTree, Requirements as FilterRequirements, + FilterPipelineStrategy, FilterPipelineValueTree, + Requirements as FilterRequirements, }; -use crate::{physical_type_go, Datatype}; + +impl AttributeData { + /// Returns a strategy for generating values of this attribute's type. 
+ pub fn value_strategy(&self) -> PhysicalValueStrategy { + use proptest::prelude::*; + use tiledb_common::filter::{ + CompressionData, CompressionType, FilterData, + }; + use tiledb_common::physical_type_go; + + let has_double_delta = self.filters.iter().any(|f| { + matches!( + f, + FilterData::Compression(CompressionData { + kind: CompressionType::DoubleDelta { .. }, + .. + }) + ) + }); + + physical_type_go!(self.datatype, DT, { + if has_double_delta { + if std::any::TypeId::of::
() == std::any::TypeId::of::() + { + // see core `DoubleDelta::compute_bitsize` + let min = 0u64; + let max = u64::MAX >> 1; + return PhysicalValueStrategy::from((min..=max).boxed()); + } else if std::any::TypeId::of::
() + == std::any::TypeId::of::() + { + let min = i64::MIN >> 2; + let max = i64::MAX >> 2; + return PhysicalValueStrategy::from((min..=max).boxed()); + } + } + PhysicalValueStrategy::from(any::
().boxed()) + }) + } +} #[derive(Clone)] pub enum StrategyContext { @@ -56,7 +99,7 @@ fn prop_filters( datatype: Datatype, cell_val_num: CellValNum, requirements: Rc, -) -> impl Strategy { +) -> impl Strategy> { use crate::filter::strategy::StrategyContext as FilterContext; let pipeline_requirements = FilterRequirements { @@ -82,7 +125,7 @@ fn prop_filters( .unwrap_or_default() }; - any_with::(Rc::new(pipeline_requirements)) + FilterPipelineStrategy::new(Rc::new(pipeline_requirements)) } /// Returns a strategy for generating an arbitrary Attribute of the given datatype @@ -159,6 +202,15 @@ pub fn prop_attribute( .boxed() } +impl Arbitrary for AttributeData { + type Parameters = Option>; + type Strategy = BoxedStrategy; + + fn arbitrary_with(params: Self::Parameters) -> Self::Strategy { + prop_attribute(params.unwrap_or_default()).boxed() + } +} + #[derive(Clone, Debug)] pub struct AttributeValueTree { name: String, @@ -204,35 +256,3 @@ impl ValueTree for AttributeValueTree { self.filters.complicate() } } - -#[cfg(test)] -mod tests { - use super::*; - use crate::{Context, Factory}; - use util::assert_option_subset; - use util::option::OptionSubset; - - /// Test that the arbitrary attribute construction always succeeds - #[test] - fn attribute_arbitrary() { - let ctx = Context::new().expect("Error creating context"); - - proptest!(|(attr in prop_attribute(Default::default()))| { - attr.create(&ctx).expect("Error constructing arbitrary attribute"); - }); - } - - #[test] - fn attribute_eq_reflexivity() { - let ctx = Context::new().expect("Error creating context"); - - proptest!(|(attr in prop_attribute(Default::default()))| { - assert_eq!(attr, attr); - assert_option_subset!(attr, attr); - - let attr = attr.create(&ctx) - .expect("Error constructing arbitrary attribute"); - assert_eq!(attr, attr); - }); - } -} diff --git a/tiledb/pod/src/array/dimension/mod.rs b/tiledb/pod/src/array/dimension/mod.rs new file mode 100644 index 00000000..c89dd0b4 --- /dev/null +++ 
b/tiledb/pod/src/array/dimension/mod.rs @@ -0,0 +1,34 @@ +#[cfg(feature = "option-subset")] +use tiledb_utils::option::OptionSubset; + +#[cfg(any(test, feature = "proptest-strategies"))] +pub mod strategy; + +#[cfg(feature = "serde")] +use serde::{Deserialize, Serialize}; + +use tiledb_common::array::dimension::DimensionConstraints; +use tiledb_common::array::CellValNum; +use tiledb_common::datatype::Datatype; +use tiledb_common::filter::FilterData; + +/// Encapsulation of data needed to construct a Dimension +#[derive(Clone, Debug, PartialEq)] +#[cfg_attr(feature = "option-subset", derive(OptionSubset))] +#[cfg_attr(feature = "serde", derive(Deserialize, Serialize))] +pub struct DimensionData { + pub name: String, + pub datatype: Datatype, + pub constraints: DimensionConstraints, + + /// Optional filters to apply to the dimension. If None or Some(empty), + /// then filters will be inherited from the schema's `coordinate_filters` + /// field when the array is constructed. + pub filters: Option>, +} + +impl DimensionData { + pub fn cell_val_num(&self) -> CellValNum { + self.constraints.cell_val_num() + } +} diff --git a/tiledb/api/src/array/dimension/strategy.rs b/tiledb/pod/src/array/dimension/strategy.rs similarity index 53% rename from tiledb/api/src/array/dimension/strategy.rs rename to tiledb/pod/src/array/dimension/strategy.rs index fc139136..92c69603 100644 --- a/tiledb/api/src/array/dimension/strategy.rs +++ b/tiledb/pod/src/array/dimension/strategy.rs @@ -5,21 +5,138 @@ use num_traits::{Bounded, FromPrimitive, Num}; use proptest::prelude::*; use proptest::strategy::ValueTree; -use tiledb_test_utils::strategy::StrategyExt; +use strategy_ext::StrategyExt; +use tiledb_common::array::ArrayType; +use tiledb_common::datatype::physical::strategy::PhysicalValueStrategy; +use tiledb_common::datatype::physical::BitsOrd; +use tiledb_common::datatype::strategy::*; +use tiledb_common::datatype::Datatype; +use tiledb_common::physical_type_go; use tiledb_utils::numbers::{ 
NextDirection, NextNumericValue, SmallestPositiveValue, }; use crate::array::dimension::DimensionConstraints; -use crate::array::{ArrayType, DimensionData}; -use crate::datatype::physical::BitsOrd; -use crate::datatype::strategy::*; -use crate::filter::list::FilterListData; +use crate::array::dimension::DimensionData; use crate::filter::strategy::{ - FilterPipelineValueTree, Requirements as FilterRequirements, - StrategyContext as FilterContext, + FilterPipelineStrategy, FilterPipelineValueTree, + Requirements as FilterRequirements, StrategyContext as FilterContext, }; -use crate::{physical_type_go, Datatype}; + +impl DimensionData { + /// Returns a strategy for generating values of this dimension's type + /// which fall within the domain of this dimension. + pub fn value_strategy(&self) -> PhysicalValueStrategy { + use proptest::prelude::*; + use tiledb_common::dimension_constraints_go; + + dimension_constraints_go!( + self.constraints, + DT, + ref domain, + _, + PhysicalValueStrategy::from((domain[0]..=domain[1]).boxed()), + { + assert_eq!(self.datatype, Datatype::StringAscii); + PhysicalValueStrategy::from(any::().boxed()) + } + ) + } + + /// Returns a strategy for generating subarray ranges which fall within + /// the domain of this dimension. + /// + /// `cell_bound` is an optional restriction on the number of possible values + /// which the strategy is allowed to return. + /// + /// If `cell_bound` is `None`, then this function always returns `Some`. 
+ pub fn subarray_strategy( + &self, + cell_bound: Option, + ) -> Option> + { + use proptest::prelude::Just; + use proptest::strategy::Strategy; + use tiledb_common::dimension_constraints_go; + use tiledb_common::range::{Range, SingleValueRange, VarValueRange}; + + dimension_constraints_go!( + self.constraints, + DT, + ref domain, + _, + { + let cell_bound = cell_bound + .map(|bound| DT::try_from(bound).unwrap_or(DT::MAX)) + .unwrap_or(DT::MAX); + + let domain_lower = domain[0]; + let domain_upper = domain[1]; + let strat = + (domain_lower..=domain_upper).prop_flat_map(move |lb| { + let ub = std::cmp::min( + domain_upper, + lb.checked_add(cell_bound).unwrap_or(DT::MAX), + ); + (Just(lb), lb..=ub).prop_map(|(min, max)| { + Range::Single(SingleValueRange::from(&[min, max])) + }) + }); + Some(strat.boxed()) + }, + { + if cell_bound.is_some() { + /* + * This can be implemented, but there's some ambiguity about + * what it should mean when precision goes out the window, + * so wait until there's a use case to decide. + */ + return None; + } + + let domain_lower = domain[0]; + let domain_upper = domain[1]; + let strat = + (domain_lower..=domain_upper).prop_flat_map(move |lb| { + (Just(lb), (lb..=domain_upper)).prop_map( + |(min, max)| { + Range::Single(SingleValueRange::from(&[ + min, max, + ])) + }, + ) + }); + Some(strat.boxed()) + }, + { + // DimensionConstraints::StringAscii + let strat_bound = + proptest::string::string_regex("[ -~]*").unwrap().boxed(); + + if cell_bound.is_some() { + /* + * This is not tractible unless there is a bound on the string length. + * There isn't one since `StringAscii` is only allowed as a dimension + * type in sparse arrays. 
+ */ + return None; + } + + let strat = (strat_bound.clone(), strat_bound).prop_map( + |(ascii1, ascii2)| { + let (lb, ub) = if ascii1 < ascii2 { + (ascii1, ascii2) + } else { + (ascii2, ascii1) + }; + Range::Var(VarValueRange::from((lb, ub))) + }, + ); + Some(strat.boxed()) + } + ) + } +} #[derive(Clone)] pub struct Requirements { @@ -213,7 +330,7 @@ fn prop_dimension_for_datatype( .map(|rc| rc.as_ref().clone()) .unwrap_or_default() }; - let filters = any_with::(Rc::new(filter_req)); + let filters = FilterPipelineStrategy::new(Rc::new(filter_req)); (prop_dimension_name(), Just(constraints), filters) .prop_map(move |(name, constraints, filters)| DimensionData { name, @@ -315,33 +432,94 @@ impl ValueTree for DimensionValueTree { #[cfg(test)] mod tests { - use super::*; - use crate::{Context, Factory}; - use util::assert_option_subset; - use util::option::OptionSubset; + use std::rc::Rc; - /// Test that the arbitrary dimension construction always succeeds - #[test] - fn test_prop_dimension() { - let ctx = Context::new().expect("Error creating context"); + use proptest::prelude::*; + use proptest::strategy::Strategy; + use tiledb_common::range::{Range, SingleValueRange}; - proptest!(|(maybe_dimension in any::())| { - maybe_dimension.create(&ctx) - .expect("Error constructing arbitrary dimension"); - }); - } + use super::Requirements; + use super::*; #[test] - fn dimension_eq_reflexivity() { - let ctx = Context::new().expect("Error creating context"); - - proptest!(|(dimension in any::())| { - assert_eq!(dimension, dimension); - assert_option_subset!(dimension, dimension); + fn subarray_strategy_dense() { + let req = Requirements { + array_type: Some(ArrayType::Dense), + ..Default::default() + }; + let strat = ( + any_with::(req), + prop_oneof![Just(None), any::().prop_map(Some)], + ) + .prop_flat_map(|(d, cell_bound)| { + let subarray_strat = d + .subarray_strategy(cell_bound) + .expect("Dense dimension must have a subarray strategy"); + (Just(Rc::new(d)), 
Just(cell_bound), subarray_strat) + }); - let dimension = dimension - .create(&ctx).expect("Error constructing arbitrary attribute"); - assert_eq!(dimension, dimension); + proptest!(|((d, cell_bound, s) in strat)| { + if let Some(bound) = cell_bound { + assert!(s.num_cells().unwrap() <= bound as u128); + } + if let Some(num_cells) = d.constraints.num_cells() { + assert!(s.num_cells().unwrap() <= num_cells); + } + let Range::Single(s) = s else { + unreachable!("Unexpected range for dense dimension: {:?}", s) + }; + let (start, end) = match s { + SingleValueRange::Int8(start, end) => { + let DimensionConstraints::Int8([lb, ub], _) = d.constraints else { unreachable!() }; + assert!(lb <= start); + assert!(end <= ub); + (start as i128, end as i128) + } + SingleValueRange::Int16(start, end) => { + let DimensionConstraints::Int16([lb, ub], _) = d.constraints else { unreachable!() }; + assert!(lb <= start); + assert!(end <= ub); + (start as i128, end as i128) + } + SingleValueRange::Int32(start, end) => { + let DimensionConstraints::Int32([lb, ub], _) = d.constraints else { unreachable!() }; + assert!(lb <= start); + assert!(end <= ub); + (start as i128, end as i128) + } + SingleValueRange::Int64(start, end) => { + let DimensionConstraints::Int64([lb, ub], _) = d.constraints else { unreachable!() }; + assert!(lb <= start); + assert!(end <= ub); + (start as i128, end as i128) + } + SingleValueRange::UInt8(start, end) => { + let DimensionConstraints::UInt8([lb, ub], _) = d.constraints else { unreachable!() }; + assert!(lb <= start); + assert!(end <= ub); + (start as i128, end as i128) + } + SingleValueRange::UInt16(start, end) => { + let DimensionConstraints::UInt16([lb, ub], _) = d.constraints else { unreachable!() }; + assert!(lb <= start); + assert!(end <= ub); + (start as i128, end as i128) + } + SingleValueRange::UInt32(start, end) => { + let DimensionConstraints::UInt32([lb, ub], _) = d.constraints else { unreachable!() }; + assert!(lb <= start); + assert!(end <= ub); 
+ (start as i128, end as i128) + } + SingleValueRange::UInt64(start, end) => { + let DimensionConstraints::UInt64([lb, ub], _) = d.constraints else { unreachable!() }; + assert!(lb <= start); + assert!(end <= ub); + (start as i128, end as i128) + }, + s => unreachable!("Unexpected range type for dense dimension: {:?}", s) + }; + assert_eq!(Some((end - start + 1) as u128), s.num_cells()); }); } } diff --git a/tiledb/pod/src/array/domain/mod.rs b/tiledb/pod/src/array/domain/mod.rs new file mode 100644 index 00000000..394bf8a3 --- /dev/null +++ b/tiledb/pod/src/array/domain/mod.rs @@ -0,0 +1,54 @@ +#[cfg(any(test, feature = "proptest-strategies"))] +pub mod strategy; + +#[cfg(feature = "option-subset")] +use tiledb_utils::option::OptionSubset; + +#[cfg(feature = "serde")] +use serde::{Deserialize, Serialize}; + +use tiledb_common::range::{NonEmptyDomain, Range}; + +use crate::array::dimension::DimensionData; + +/// Encapsulation of data needed to construct a Domain +#[derive(Clone, Default, Debug, PartialEq)] +#[cfg_attr(feature = "option-subset", derive(OptionSubset))] +#[cfg_attr(feature = "serde", derive(Deserialize, Serialize))] +pub struct DomainData { + pub dimension: Vec, +} + +impl DomainData { + /// Returns the total number of cells spanned by all dimensions, + /// or `None` if: + /// - any dimension is not constrained into a domain; or + /// - the total number of cells exceeds `usize::MAX`. + pub fn num_cells(&self) -> Option { + let mut total = 1u128; + for d in self.dimension.iter() { + total = total.checked_mul(d.constraints.num_cells()?)?; + } + usize::try_from(total).ok() + } + + /// Returns the number of cells in each tile, or `None` if: + /// - any dimension does not have a tile extent specified (e.g. for a sparse array); or + /// - the number of cells in a tile exceeds `usize::MAX`. 
+ pub fn num_cells_per_tile(&self) -> Option { + let mut total = 1usize; + for d in self.dimension.iter() { + total = total.checked_mul(d.constraints.num_cells_per_tile()?)?; + } + Some(total) + } + + /// Returns the domains of each dimension as a `NonEmptyDomain`, + /// or `None` if any dimension is not constrained into a domain + pub fn domains(&self) -> Option { + self.dimension + .iter() + .map(|d| d.constraints.domain().map(Range::Single)) + .collect::>() + } +} diff --git a/tiledb/api/src/array/domain/strategy.rs b/tiledb/pod/src/array/domain/strategy.rs similarity index 81% rename from tiledb/api/src/array/domain/strategy.rs rename to tiledb/pod/src/array/domain/strategy.rs index 7664ce1b..27c4cfc9 100644 --- a/tiledb/api/src/array/domain/strategy.rs +++ b/tiledb/pod/src/array/domain/strategy.rs @@ -3,15 +3,29 @@ use std::rc::Rc; use proptest::prelude::*; use proptest::sample::select; use proptest::strategy::ValueTree; -use tiledb_test_utils::strategy::records::RecordsValueTree; -use tiledb_test_utils::strategy::StrategyExt; +use strategy_ext::records::RecordsValueTree; +use strategy_ext::StrategyExt; +use tiledb_common::array::ArrayType; +use tiledb_common::datatype::strategy::*; +use tiledb_common::datatype::Datatype; +use tiledb_common::range::Range; use crate::array::dimension::strategy::{ DimensionValueTree, Requirements as DimensionRequirements, }; -use crate::array::{ArrayType, DimensionData, DomainData}; -use crate::datatype::strategy::*; -use crate::Datatype; +use crate::array::dimension::DimensionData; +use crate::array::domain::DomainData; + +impl DomainData { + pub fn subarray_strategy( + &self, + ) -> impl proptest::prelude::Strategy> { + self.dimension + .iter() + .map(|d| d.subarray_strategy(None).unwrap()) + .collect::>>() + } +} #[derive(Clone)] pub struct Requirements { @@ -23,19 +37,20 @@ pub struct Requirements { impl Requirements { pub fn env_max_dimensions() -> Option { - 
crate::strategy::config::TILEDB_STRATEGY_DOMAIN_PARAMETERS_DIMENSIONS_MAX.environmental() + tiledb_proptest_config::TILEDB_STRATEGY_DOMAIN_PARAMETERS_DIMENSIONS_MAX + .environmental() } pub fn min_dimensions_default() -> usize { - **crate::strategy::config::TILEDB_STRATEGY_DOMAIN_PARAMETERS_DIMENSIONS_MIN + **tiledb_proptest_config::TILEDB_STRATEGY_DOMAIN_PARAMETERS_DIMENSIONS_MIN } pub fn max_dimensions_default() -> usize { - **crate::strategy::config::TILEDB_STRATEGY_DOMAIN_PARAMETERS_DIMENSIONS_MAX + **tiledb_proptest_config::TILEDB_STRATEGY_DOMAIN_PARAMETERS_DIMENSIONS_MAX } pub fn cells_per_tile_limit_default() -> usize { - **crate::strategy::config::TILEDB_STRATEGY_DOMAIN_PARAMETERS_CELLS_PER_TILE_LIMIT + **tiledb_proptest_config::TILEDB_STRATEGY_DOMAIN_PARAMETERS_CELLS_PER_TILE_LIMIT } } @@ -192,36 +207,8 @@ impl ValueTree for DomainValueTree { #[cfg(test)] mod tests { use super::*; - use crate::{Context, Factory}; use proptest::strategy::ValueTree; - use util::option::OptionSubset; - - /// Test that the arbitrary domain construction always succeeds - #[test] - fn domain_arbitrary() { - let ctx = Context::new().expect("Error creating context"); - - proptest!(|(maybe_domain in any::())| { - maybe_domain.create(&ctx) - .expect("Error constructing arbitrary domain"); - }); - } - - #[test] - fn domain_eq_reflexivity() { - let ctx = Context::new().expect("Error creating context"); - - proptest!(|(domain in any::())| { - assert_eq!(domain, domain); - assert!(domain.option_subset(&domain)); - - let domain = domain.create(&ctx) - .expect("Error constructing arbitrary domain"); - assert_eq!(domain, domain); - }); - } - #[ignore = "takes a long time due to shrink iters on name, we should do custom shrink strategy"] #[test] fn domain_shrinking() { let strat = any::(); diff --git a/tiledb/pod/src/array/enumeration/mod.rs b/tiledb/pod/src/array/enumeration/mod.rs new file mode 100644 index 00000000..fab341fb --- /dev/null +++ b/tiledb/pod/src/array/enumeration/mod.rs @@ 
-0,0 +1,23 @@ +#[cfg(any(test, feature = "proptest-strategies"))] +pub mod strategy; + +#[cfg(feature = "option-subset")] +use tiledb_utils::option::OptionSubset; + +#[cfg(feature = "serde")] +use serde::{Deserialize, Serialize}; + +use tiledb_common::datatype::Datatype; + +/// Encapsulation of data needed to construct an Enumeration +#[derive(Clone, Debug, PartialEq)] +#[cfg_attr(feature = "option-subset", derive(OptionSubset))] +#[cfg_attr(feature = "serde", derive(Deserialize, Serialize))] +pub struct EnumerationData { + pub name: String, + pub datatype: Datatype, + pub cell_val_num: Option, + pub ordered: Option, + pub data: Box<[u8]>, + pub offsets: Option>, +} diff --git a/tiledb/api/src/array/enumeration/strategy.rs b/tiledb/pod/src/array/enumeration/strategy.rs similarity index 72% rename from tiledb/api/src/array/enumeration/strategy.rs rename to tiledb/pod/src/array/enumeration/strategy.rs index e55db654..f051b506 100644 --- a/tiledb/api/src/array/enumeration/strategy.rs +++ b/tiledb/pod/src/array/enumeration/strategy.rs @@ -2,10 +2,10 @@ use std::cmp::Ordering; use proptest::collection::vec; use proptest::prelude::*; +use tiledb_common::datatype::{Datatype, PhysicalType}; +use tiledb_common::physical_type_go; -use crate::array::EnumerationData; -use crate::datatype::PhysicalType; -use crate::{physical_type_go, Datatype}; +use crate::array::enumeration::EnumerationData; pub fn prop_enumeration_name() -> impl Strategy { proptest::string::string_regex("[a-zA-Z0-9_]+") @@ -76,11 +76,11 @@ pub struct Parameters { impl Parameters { fn min_variants_default() -> usize { - **crate::strategy::config::TILEDB_STRATEGY_ENUMERATION_PARAMETERS_NUM_VARIANTS_MIN + **tiledb_proptest_config::TILEDB_STRATEGY_ENUMERATION_PARAMETERS_NUM_VARIANTS_MIN } fn max_variants_default() -> usize { - **crate::strategy::config::TILEDB_STRATEGY_ENUMERATION_PARAMETERS_NUM_VARIANTS_MAX + **tiledb_proptest_config::TILEDB_STRATEGY_ENUMERATION_PARAMETERS_NUM_VARIANTS_MAX } } @@ -111,30 +111,3 
@@ impl Arbitrary for EnumerationData { .boxed() } } - -#[cfg(test)] -mod tests { - use super::*; - use crate::{Context, Factory}; - - /// Test that the arbitrary enumeration construction always succeeds - #[test] - fn enumeration_arbitrary() { - let ctx = Context::new().expect("Error creating context"); - - proptest!(|(enmr in any::())| { - enmr.create(&ctx).expect("Error constructing arbitrary enumeration"); - }); - } - - #[test] - fn enumeration_eq_reflexivity() { - let ctx = Context::new().expect("Error creating context"); - - proptest!(|(enmr in any::())| { - let enmr = enmr.create(&ctx) - .expect("Error constructing arbitrary enumeration"); - assert_eq!(enmr, enmr); - }); - } -} diff --git a/tiledb/pod/src/array/mod.rs b/tiledb/pod/src/array/mod.rs new file mode 100644 index 00000000..7c42eebd --- /dev/null +++ b/tiledb/pod/src/array/mod.rs @@ -0,0 +1,5 @@ +pub mod attribute; +pub mod dimension; +pub mod domain; +pub mod enumeration; +pub mod schema; diff --git a/tiledb/pod/src/array/schema/mod.rs b/tiledb/pod/src/array/schema/mod.rs new file mode 100644 index 00000000..1b8d3008 --- /dev/null +++ b/tiledb/pod/src/array/schema/mod.rs @@ -0,0 +1,183 @@ +#[cfg(feature = "option-subset")] +use tiledb_utils::option::OptionSubset; + +#[cfg(any(test, feature = "proptest-strategies"))] +pub mod strategy; + +#[cfg(feature = "serde")] +use serde::{Deserialize, Serialize}; + +use tiledb_common::array::{ArrayType, CellOrder, CellValNum, TileOrder}; +use tiledb_common::datatype::Datatype; +use tiledb_common::filter::FilterData; +use tiledb_common::key::LookupKey; + +use crate::array::attribute::AttributeData; +use crate::array::dimension::DimensionData; +use crate::array::domain::DomainData; + +/// Encapsulation of data needed to construct a Schema +#[derive(Clone, Default, Debug, PartialEq)] +#[cfg_attr(feature = "option-subset", derive(OptionSubset))] +#[cfg_attr(feature = "serde", derive(Deserialize, Serialize))] +pub struct SchemaData { + pub array_type: ArrayType, + 
pub domain: DomainData, + pub capacity: Option, + pub cell_order: Option, + pub tile_order: Option, + pub allow_duplicates: Option, + pub attributes: Vec, + pub coordinate_filters: Vec, + pub offsets_filters: Vec, + pub nullity_filters: Vec, +} + +impl SchemaData { + pub const DEFAULT_SPARSE_TILE_CAPACITY: u64 = 10000; + + pub fn num_fields(&self) -> usize { + self.domain.dimension.len() + self.attributes.len() + } + + pub fn field>(&self, key: K) -> Option { + match key.into() { + LookupKey::Index(idx) => { + if idx < self.domain.dimension.len() { + Some(FieldData::from(self.domain.dimension[idx].clone())) + } else if idx + < self.domain.dimension.len() + self.attributes.len() + { + Some(FieldData::from( + self.attributes[idx - self.domain.dimension.len()] + .clone(), + )) + } else { + None + } + } + LookupKey::Name(name) => { + for d in self.domain.dimension.iter() { + if d.name == name { + return Some(FieldData::from(d.clone())); + } + } + for a in self.attributes.iter() { + if a.name == name { + return Some(FieldData::from(a.clone())); + } + } + None + } + } + } + + pub fn fields(&self) -> FieldDataIter { + FieldDataIter::new(self) + } + + /// Returns the number of cells per tile + pub fn num_cells_per_tile(&self) -> usize { + match self.array_type { + ArrayType::Dense => { + // it should be safe to unwrap, the two `None` conditions must not + // be satisfied for a dense array domain + // (TODO: what about for string ascii dense domains?) 
+ self.domain.num_cells_per_tile().unwrap() + } + ArrayType::Sparse => { + self.capacity.unwrap_or(Self::DEFAULT_SPARSE_TILE_CAPACITY) + as usize + } + } + } +} + +#[derive(Clone, Debug, PartialEq)] +#[cfg_attr(feature = "option-subset", derive(OptionSubset))] +#[cfg_attr(feature = "serde", derive(Deserialize, Serialize))] +pub enum FieldData { + Dimension(DimensionData), + Attribute(AttributeData), +} + +impl FieldData { + pub fn is_attribute(&self) -> bool { + matches!(self, Self::Attribute(_)) + } + + pub fn is_dimension(&self) -> bool { + matches!(self, Self::Dimension(_)) + } + + pub fn name(&self) -> &str { + match self { + Self::Dimension(d) => &d.name, + Self::Attribute(a) => &a.name, + } + } + + pub fn datatype(&self) -> Datatype { + match self { + Self::Dimension(d) => d.datatype, + Self::Attribute(a) => a.datatype, + } + } + + pub fn cell_val_num(&self) -> Option { + match self { + Self::Dimension(d) => Some(d.cell_val_num()), + Self::Attribute(a) => a.cell_val_num, + } + } + + pub fn nullability(&self) -> Option { + match self { + Self::Dimension(_) => Some(false), + Self::Attribute(a) => a.nullability, + } + } +} + +impl From for FieldData { + fn from(attr: AttributeData) -> Self { + FieldData::Attribute(attr) + } +} + +impl From for FieldData { + fn from(dim: DimensionData) -> Self { + FieldData::Dimension(dim) + } +} + +pub struct FieldDataIter<'a> { + schema: &'a SchemaData, + cursor: usize, +} + +impl<'a> FieldDataIter<'a> { + pub fn new(schema: &'a SchemaData) -> Self { + FieldDataIter { schema, cursor: 0 } + } +} + +impl Iterator for FieldDataIter<'_> { + type Item = FieldData; + fn next(&mut self) -> Option { + if self.cursor < self.schema.num_fields() { + let item = self.schema.field(self.cursor); + self.cursor += 1; + Some(item.expect("Internal indexing error")) + } else { + None + } + } + + fn size_hint(&self) -> (usize, Option) { + let exact = self.schema.num_fields() - self.cursor; + (exact, Some(exact)) + } +} + +impl 
std::iter::FusedIterator for FieldDataIter<'_> {} diff --git a/tiledb/api/src/array/schema/strategy.rs b/tiledb/pod/src/array/schema/strategy.rs similarity index 77% rename from tiledb/api/src/array/schema/strategy.rs rename to tiledb/pod/src/array/schema/strategy.rs index a0deed1c..59ba1422 100644 --- a/tiledb/api/src/array/schema/strategy.rs +++ b/tiledb/pod/src/array/schema/strategy.rs @@ -1,39 +1,30 @@ use std::collections::HashSet; -use std::num::NonZeroU32; use std::rc::Rc; use proptest::prelude::*; use proptest::sample::select; use proptest::strategy::ValueTree; -use tiledb_test_utils::strategy::records::RecordsValueTree; -use tiledb_test_utils::strategy::StrategyExt; +use strategy_ext::records::RecordsValueTree; +use strategy_ext::StrategyExt; +use tiledb_common::array::{ArrayType, CellOrder, TileOrder}; +use tiledb_common::filter::FilterData; use crate::array::attribute::strategy::{ prop_attribute, AttributeValueTree, Requirements as AttributeRequirements, StrategyContext as AttributeContext, }; +use crate::array::attribute::AttributeData; +use crate::array::dimension::DimensionData; use crate::array::domain::strategy::{ DomainValueTree, Requirements as DomainRequirements, }; -use crate::array::{ - schema::FieldData, ArrayType, AttributeData, CellOrder, CellValNum, - DimensionData, DomainData, SchemaData, TileOrder, -}; -use crate::filter::list::FilterListData; +use crate::array::domain::DomainData; +use crate::array::schema::{FieldData, SchemaData}; use crate::filter::strategy::{ - FilterPipelineValueTree, Requirements as FilterRequirements, - StrategyContext as FilterContext, + FilterPipelineStrategy, FilterPipelineValueTree, + Requirements as FilterRequirements, StrategyContext as FilterContext, }; -impl Arbitrary for ArrayType { - type Parameters = (); - type Strategy = BoxedStrategy; - - fn arbitrary_with(_: Self::Parameters) -> Self::Strategy { - prop_oneof![Just(ArrayType::Dense), Just(ArrayType::Sparse)].boxed() - } -} - #[derive(Clone)] pub struct 
Requirements { pub domain: Option>, @@ -47,19 +38,19 @@ pub struct Requirements { impl Requirements { pub fn min_attributes_default() -> usize { - **crate::strategy::config::TILEDB_STRATEGY_SCHEMA_PARAMETERS_ATTRIBUTES_MIN + **tiledb_proptest_config::TILEDB_STRATEGY_SCHEMA_PARAMETERS_ATTRIBUTES_MIN } pub fn max_attributes_default() -> usize { - **crate::strategy::config::TILEDB_STRATEGY_SCHEMA_PARAMETERS_ATTRIBUTES_MAX + **tiledb_proptest_config::TILEDB_STRATEGY_SCHEMA_PARAMETERS_ATTRIBUTES_MAX } pub fn min_sparse_tile_capacity_default() -> u64 { - **crate::strategy::config::TILEDB_STRATEGY_SCHEMA_PARAMETERS_SPARSE_TILE_CAPACITY_MIN + **tiledb_proptest_config::TILEDB_STRATEGY_SCHEMA_PARAMETERS_SPARSE_TILE_CAPACITY_MIN } pub fn max_sparse_tile_capacity_default() -> u64 { - **crate::strategy::config::TILEDB_STRATEGY_SCHEMA_PARAMETERS_SPARSE_TILE_CAPACITY_MIN + **tiledb_proptest_config::TILEDB_STRATEGY_SCHEMA_PARAMETERS_SPARSE_TILE_CAPACITY_MIN } } @@ -79,62 +70,10 @@ impl Default for Requirements { } } -impl Arbitrary for CellValNum { - type Strategy = BoxedStrategy; - type Parameters = Option>; - - fn arbitrary_with(r: Self::Parameters) -> Self::Strategy { - if let Some(range) = r { - (range.start.get()..range.end.get()) - .prop_map(|nz| CellValNum::try_from(nz).unwrap()) - .boxed() - } else { - prop_oneof![ - 30 => Just(CellValNum::single()), - 30 => Just(CellValNum::Var), - 25 => (2u32..=8).prop_map(|nz| CellValNum::try_from(nz).unwrap()), - 10 => (9u32..=16).prop_map(|nz| CellValNum::try_from(nz).unwrap()), - 3 => (17u32..=32).prop_map(|nz| CellValNum::try_from(nz).unwrap()), - 2 => (33u32..=64).prop_map(|nz| CellValNum::try_from(nz).unwrap()), - // NB: large fixed CellValNums don't really reflect production use cases - // and are not well tested, and are known to cause problems - ].boxed() - } - } -} - -impl Arbitrary for CellOrder { - type Strategy = BoxedStrategy; - type Parameters = Option; - - fn arbitrary_with(args: Self::Parameters) -> Self::Strategy { - 
match args { - None => prop_oneof![ - Just(CellOrder::Unordered), - Just(CellOrder::RowMajor), - Just(CellOrder::ColumnMajor), - Just(CellOrder::Hilbert), - ] - .boxed(), - Some(ArrayType::Sparse) => prop_oneof![ - Just(CellOrder::RowMajor), - Just(CellOrder::ColumnMajor), - Just(CellOrder::Hilbert), - ] - .boxed(), - Some(ArrayType::Dense) => prop_oneof![ - Just(CellOrder::RowMajor), - Just(CellOrder::ColumnMajor), - ] - .boxed(), - } - } -} - pub fn prop_coordinate_filters( domain: &DomainData, params: &Requirements, -) -> impl Strategy { +) -> impl Strategy> { let req = FilterRequirements { context: Some(FilterContext::SchemaCoordinates(Rc::new( domain.clone(), @@ -145,7 +84,7 @@ pub fn prop_coordinate_filters( .map(|rc| rc.as_ref().clone()) .unwrap_or_default() }; - any_with::(Rc::new(req)) + FilterPipelineStrategy::new(Rc::new(req)) } fn prop_schema_for_domain( @@ -198,8 +137,8 @@ fn prop_schema_for_domain( params.num_attributes.clone() ), prop_coordinate_filters(&domain, params.as_ref()), - any_with::(offsets_filters_requirements), - any_with::(validity_filters_requirements) + FilterPipelineStrategy::new(offsets_filters_requirements), + FilterPipelineStrategy::new(validity_filters_requirements) ) .prop_map( move |( @@ -452,33 +391,6 @@ impl ValueTree for SchemaValueTree { #[cfg(test)] mod tests { use super::*; - use crate::{Context, Factory}; - use util::option::OptionSubset; - - /// Test that the arbitrary schema construction always succeeds - #[test] - fn schema_arbitrary() { - let ctx = Context::new().expect("Error creating context"); - - proptest!(|(maybe_schema in any::())| { - maybe_schema.create(&ctx) - .expect("Error constructing arbitrary schema"); - }); - } - - #[test] - fn schema_eq_reflexivity() { - let ctx = Context::new().expect("Error creating context"); - - proptest!(|(schema in any::())| { - assert_eq!(schema, schema); - assert!(schema.option_subset(&schema)); - - let schema = schema.create(&ctx) - .expect("Error constructing arbitrary 
schema"); - assert_eq!(schema, schema); - }); - } /// Runs one instance of [schema_value_tree] fn test_schema_value_tree(mut vt: SchemaValueTree) { diff --git a/tiledb/pod/src/filter/mod.rs b/tiledb/pod/src/filter/mod.rs new file mode 100644 index 00000000..cd8f4d49 --- /dev/null +++ b/tiledb/pod/src/filter/mod.rs @@ -0,0 +1,2 @@ +#[cfg(any(test, feature = "proptest-strategies"))] +pub mod strategy; diff --git a/tiledb/api/src/filter/strategy.rs b/tiledb/pod/src/filter/strategy.rs similarity index 81% rename from tiledb/api/src/filter/strategy.rs rename to tiledb/pod/src/filter/strategy.rs index 77549006..3a19a8d9 100644 --- a/tiledb/api/src/filter/strategy.rs +++ b/tiledb/pod/src/filter/strategy.rs @@ -3,13 +3,14 @@ use std::rc::Rc; use proptest::prelude::*; use proptest::strategy::{NewTree, ValueTree}; use proptest::test_runner::TestRunner; -use tiledb_test_utils::strategy::sequence::SequenceValueTree; +use strategy_ext::sequence::SequenceValueTree; +use tiledb_common::array::{ArrayType, CellValNum}; +use tiledb_common::datatype::strategy::DatatypeContext; +use tiledb_common::datatype::Datatype; +use tiledb_common::dimension_constraints_go; +use tiledb_common::filter::*; -use crate::array::{ArrayType, CellValNum, DomainData}; -use crate::datatype::strategy::DatatypeContext; -use crate::dimension_constraints_go; -use crate::filter::list::FilterListData; -use crate::filter::*; +use crate::array::domain::DomainData; #[derive(Clone, Debug)] pub enum StrategyContext { @@ -428,7 +429,7 @@ pub fn prop_filter( None => true, Some(dt) => { [std::mem::size_of::(), std::mem::size_of::()] - .contains(&(dt.size() as usize)) + .contains(&dt.size()) } }; if ok_scale_float { @@ -442,7 +443,7 @@ pub fn prop_filter( let ok_xor = requirements.allow_xor && match requirements.input_datatype { Some(input_datatype) => { - [1, 2, 4, 8].contains(&(input_datatype.size() as usize)) + [1, 2, 4, 8].contains(&input_datatype.size()) } None => true, }; @@ -469,18 +470,18 @@ pub struct 
FilterPipelineValueTree { } impl FilterPipelineValueTree { - pub fn new(init: FilterListData) -> Self { + pub fn new(init: Vec) -> Self { FilterPipelineValueTree { - inner: SequenceValueTree::new(init.into_inner()), + inner: SequenceValueTree::new(init), } } } impl ValueTree for FilterPipelineValueTree { - type Value = FilterListData; + type Value = Vec; fn current(&self) -> Self::Value { - FilterListData::from(self.inner.current()) + self.inner.current() } fn simplify(&mut self) -> bool { @@ -492,14 +493,20 @@ impl ValueTree for FilterPipelineValueTree { } } -#[derive(Debug)] +#[derive(Debug, Default)] pub struct FilterPipelineStrategy { requirements: Rc, } +impl FilterPipelineStrategy { + pub fn new(requirements: Rc) -> Self { + Self { requirements } + } +} + impl Strategy for FilterPipelineStrategy { type Tree = FilterPipelineValueTree; - type Value = FilterListData; + type Value = Vec; fn new_tree(&self, runner: &mut TestRunner) -> NewTree { const MIN_FILTERS: usize = 0; @@ -537,112 +544,20 @@ impl Strategy for FilterPipelineStrategy { filters.push(f); } - filters.into_iter().collect::() + filters.into_iter().collect::>() }; Ok(FilterPipelineValueTree::new(initial_pipeline)) } } -impl Arbitrary for FilterListData { - type Parameters = Rc; - type Strategy = FilterPipelineStrategy; - - fn arbitrary_with(args: Self::Parameters) -> Self::Strategy { - FilterPipelineStrategy { - requirements: Rc::clone(&args), - } - } -} - #[cfg(test)] mod tests { use super::*; - use crate::Factory; - use util::assert_option_subset; - - #[test] - /// Test that the arbitrary filter construction always succeeds - fn filter_arbitrary() { - let ctx = Context::new().expect("Error creating context"); - - proptest!(|(filt in any::())| { - filt.create(&ctx).expect("Error constructing arbitrary filter"); - }); - } - - /// Test that the arbitrary filter construction always succeeds with a - /// supplied datatype - #[test] - fn filter_arbitrary_for_datatype() { - let ctx = 
Context::new().expect("Error creating context"); - - let strat = any::().prop_flat_map(|dt| { - ( - Just(dt), - prop_filter(Rc::new(Requirements { - input_datatype: Some(dt), - ..Default::default() - })), - ) - }); - - proptest!(|((dt, filt) in strat)| { - let filt = filt.create(&ctx) - .expect("Error constructing arbitrary filter"); - - let filt_data = filt.filter_data() - .expect("Error reading filter data"); - assert!(filt_data.transform_datatype(&dt).is_some()); - }); - } - - #[test] - /// Test that the arbitrary filter list construction always succeeds - fn filter_list_arbitrary() { - let ctx = Context::new().expect("Error creating context"); - - proptest!(|(fl in any::())| { - fl.create(&ctx).expect("Error constructing arbitrary filter list"); - }); - } - - #[test] - /// Test that the arbitrary filter list construction always succeeds with a - /// supplied datatype - fn filter_list_arbitrary_for_datatype() { - let ctx = Context::new().expect("Error creating context"); - - let strat = any::().prop_flat_map(|dt| { - let req = Rc::new(Requirements { - input_datatype: Some(dt), - ..Default::default() - }); - (Just(dt), any_with::(req)) - }); - - proptest!(|((dt, fl) in strat)| { - let fl = fl.create(&ctx) - .expect("Error constructing arbitrary filter"); - - let mut current_dt = dt; - - let fl = fl.to_vec().expect("Error collecting filters"); - for (fi, f) in fl.iter().enumerate() { - if let Some(next_dt) = f.filter_data() - .expect("Error reading filter data") - .transform_datatype(¤t_dt) { - current_dt = next_dt - } else { - panic!("Constructed invalid filter list for datatype {}: \ - {:?}, invalid at position {}", dt, fl, fi) - } - } - }); - } /// Test that ScaleFloat serialization is invertible, because floating /// point sadness + #[cfg(feature = "serde")] #[test] fn filter_scalefloat_serde() { proptest!(|(scalefloat_in in prop_scalefloat())| { @@ -654,24 +569,10 @@ mod tests { }); } - #[test] - fn filter_eq_reflexivity() { - let ctx = 
Context::new().expect("Error creating context"); - - proptest!(|(pipeline in any::())| { - assert_eq!(pipeline, pipeline); - assert_option_subset!(pipeline, pipeline); - - let pipeline = pipeline.create(&ctx) - .expect("Error constructing arbitrary filter"); - assert_eq!(pipeline, pipeline); - }); - } - /// Ensure that filter pipelines can shrink #[test] fn pipeline_shrinking() { - let strat = any::(); + let strat = FilterPipelineStrategy::default(); let mut runner = proptest::test_runner::TestRunner::new(Default::default()); diff --git a/tiledb/pod/src/lib.rs b/tiledb/pod/src/lib.rs new file mode 100644 index 00000000..de100929 --- /dev/null +++ b/tiledb/pod/src/lib.rs @@ -0,0 +1,36 @@ +//! Provides "plain old data" representations of tiledb data structures. +//! +//! "Plain old data" is used to describe types in C++ programming which +//! have no constructors, destructors, or virtual member functions. +//! Values of these types can be duplicated by copying bits. +//! +//! The structures defined in this crate are representations of tiledb +//! logical structures which expose their attributes as public fields. +//! This contrasts with the [tiledb-api] crate where the structures +//! are instead handles to tiledb C API data structures. +//! +//! There is no direct "plain old data" analogue in Rust; the use +//! of the term in this crate intends to capture the spirit rather than the letter +//! of what it means to be plain old data. +//! +//! The structures in this crate can be used to construct, inspect, or manipulate `libtiledb` +//! data structures without invoking tiledb's C API. This allows an +//! application to be built without linking against libtiledb. +//! This might be desirable for embedding a description of a +//! tiledb schema in a remote procedure call, for example. +//! +//! ## Features +//! +//! * `proptest-strategies`: Provides `proptest::arbitrary::Arbitrary` implementations for many of +//! the structures defined in this crate for use with +//! 
[property-based testing](https://proptest-rs.github.io/proptest/intro.html). +//! * `serde`: Provides `serde::Deserialize` and `serde::Serialize` implementations for many +//! of the structures defined in this crate. +#[cfg(feature = "option-subset")] +#[macro_use] +extern crate tiledb_proc_macro; +extern crate tiledb_sys as ffi; + +pub mod array; +pub mod filter; +pub mod query; diff --git a/tiledb/pod/src/query/mod.rs b/tiledb/pod/src/query/mod.rs new file mode 100644 index 00000000..46f949f6 --- /dev/null +++ b/tiledb/pod/src/query/mod.rs @@ -0,0 +1 @@ +pub mod subarray; diff --git a/tiledb/pod/src/query/subarray.rs b/tiledb/pod/src/query/subarray.rs new file mode 100644 index 00000000..276657aa --- /dev/null +++ b/tiledb/pod/src/query/subarray.rs @@ -0,0 +1,378 @@ +use itertools::Itertools; +use tiledb_common::range::Range; + +/// Encapsulates data for a subarray. +#[derive(Clone, Debug, Eq, Hash, PartialEq)] +pub struct SubarrayData { + /// List of requested ranges on each dimension. + /// The outer `Vec` is the list of dimensions and the inner `Vec` + /// is the list of requested ranges for that dimension. + /// If a list is empty for a dimension, then all the coordinates + /// of that dimension are selected. + pub dimension_ranges: Vec>, +} + +impl SubarrayData { + /// Returns a new `SubarrayData` which represents the intersection + /// of all the ranges of `self` with a new set of `ranges` on each dimension. + /// + /// If any dimension does not have any intersection with `ranges`, then + /// this returns `None` as the resulting subarray would select no coordinates. 
+ pub fn intersect_ranges(&self, ranges: &[Range]) -> Option { + let updated_ranges = self + .dimension_ranges + .iter() + .zip(ranges.iter()) + .map(|(current_ranges, new_range)| { + if current_ranges.is_empty() { + // empty means select the whole thing + vec![new_range.clone()] + } else { + current_ranges + .iter() + .filter_map(|current_range| { + current_range.intersection(new_range) + }) + .collect::>() + } + }) + .collect::>>(); + + if updated_ranges.iter().any(|dim| dim.is_empty()) { + None + } else { + Some(SubarrayData { + dimension_ranges: updated_ranges, + }) + } + } + /// Returns a new `SubarrayData` which represents the intersection + /// of all the ranges of `self` with all of the ranges of `other` on each dimension. + /// + /// ``` + /// use tiledb_common::range::Range; + /// use tiledb_pod::query::subarray::SubarrayData; + /// + /// let s1 = SubarrayData { + /// dimension_ranges: vec![ + /// vec![Range::from(&[0, 100]), Range::from(&[200, 300])], + /// vec![Range::from(&[2, 6]), Range::from(&[8, 12])], + /// vec![Range::from(&[20, 30]), Range::from(&[40, 50])] + /// ] + /// }; + /// let s2 = SubarrayData { + /// dimension_ranges: vec![ + /// vec![Range::from(&[150, 250])], + /// vec![Range::from(&[4, 10]), Range::from(&[12, 12])], + /// vec![Range::from(&[25, 45])] + /// ] + /// }; + /// let intersection = s1.intersect(&s2); + /// + /// assert_eq!(intersection, Some(SubarrayData { + /// dimension_ranges: vec![ + /// vec![Range::from(&[200, 250])], + /// vec![Range::from(&[4, 6]), Range::from(&[8, 10]), Range::from(&[12, 12])], + /// vec![Range::from(&[25, 30]), Range::from(&[40, 45])] + /// ] + /// })); + /// ``` + /// + /// If any dimension does not have any intersection, then this returns `None` + /// as the resulting subarray would select no coordinates. 
+ /// ``` + /// use tiledb_common::range::Range; + /// use tiledb_pod::query::subarray::SubarrayData; + /// + /// let s1 = SubarrayData { + /// dimension_ranges: vec![ + /// vec![Range::from(&[50, 100]), Range::from(&[400, 450])] + /// ] + /// }; + /// let s2 = SubarrayData { + /// dimension_ranges: vec![ + /// vec![Range::from(&[150, 250]), Range::from(&[300, 350])], + /// ] + /// }; + /// let intersection = s1.intersect(&s2); + /// assert_eq!(intersection, None); + /// ``` + /// + /// If a dimension in `self` (without loss of generality) has no ranges, + /// then it is a special case which means to select the all coordinates. + /// The intersection is equal to the ranges of `other`. + /// ``` + /// use tiledb_common::range::Range; + /// use tiledb_pod::query::subarray::SubarrayData; + /// + /// let s1 = SubarrayData { + /// dimension_ranges: vec![ + /// vec![] + /// ] + /// }; + /// let s2 = SubarrayData { + /// dimension_ranges: vec![ + /// vec![Range::from(&[150, 250]), Range::from(&[300, 350])], + /// ] + /// }; + /// let intersection = s1.intersect(&s2); + /// assert_eq!(intersection, Some(s2.clone())); + /// ``` + pub fn intersect(&self, other: &SubarrayData) -> Option { + let updated_ranges = self + .dimension_ranges + .iter() + .zip(other.dimension_ranges.iter()) + .map(|(my_dimension, their_dimension)| { + if my_dimension.is_empty() { + // empty means select all coordinates + their_dimension.clone() + } else if their_dimension.is_empty() { + // empty means select all coordinates + my_dimension.clone() + } else { + my_dimension + .iter() + .cartesian_product(their_dimension.iter()) + .filter_map(|(rm, rt)| rm.intersection(rt)) + .collect::>() + } + }) + .collect::>>(); + + if updated_ranges.iter().any(|dim| dim.is_empty()) { + None + } else { + Some(SubarrayData { + dimension_ranges: updated_ranges, + }) + } + } +} + +#[cfg(any(test, feature = "proptest-strategies"))] +pub mod strategy { + use std::rc::Rc; + + use proptest::prelude::*; + + use super::*; + 
use crate::array::schema::SchemaData; + + impl Arbitrary for SubarrayData { + type Parameters = Option>; + type Strategy = BoxedStrategy; + + fn arbitrary_with(params: Self::Parameters) -> Self::Strategy { + let strat_dimension_ranges = if let Some(schema) = params { + schema + .domain + .dimension + .iter() + .map(|d| d.subarray_strategy(None).unwrap()) + .collect::>>() + } else { + todo!() + }; + + const DIMENSION_MIN_RANGES: usize = 0; + const DIMENSION_MAX_RANGES: usize = 4; + + strat_dimension_ranges + .into_iter() + .map(|strat_range| { + proptest::collection::vec( + strat_range, + DIMENSION_MIN_RANGES..=DIMENSION_MAX_RANGES, + ) + .boxed() + }) + .collect::>>>() + .prop_map(|dimension_ranges| SubarrayData { dimension_ranges }) + .boxed() + } + } +} + +#[cfg(test)] +mod tests { + use std::rc::Rc; + + use itertools::izip; + use proptest::prelude::*; + + use super::*; + use crate::array::domain::strategy::Requirements as DomainRequirements; + use crate::array::schema::strategy::Requirements as SchemaRequirements; + use crate::array::schema::SchemaData; + + fn do_subarray_intersect_ranges(subarray: &SubarrayData, ranges: &[Range]) { + if let Some(intersection) = subarray.intersect_ranges(ranges) { + assert_eq!( + subarray.dimension_ranges.len(), + intersection.dimension_ranges.len() + ); + assert_eq!(subarray.dimension_ranges.len(), ranges.len()); + + for (before, after, update) in izip!( + subarray.dimension_ranges.iter(), + intersection.dimension_ranges.iter(), + ranges.iter() + ) { + if before.is_empty() { + assert_eq!(vec![update.clone()], *after); + continue; + } + + assert!(after.len() <= before.len()); + + let mut r_after = after.iter(); + for r_before in before.iter() { + if let Some(r) = r_before.intersection(update) { + assert_eq!(*r_after.next().unwrap(), r); + } + } + assert_eq!(None, r_after.next()); + } + } else { + // for at least one dimension, none of the ranges could have intersected + let found_empty_intersection = subarray + .dimension_ranges 
 + .iter() + .zip(ranges.iter()) + .any(|(current, new)| { + if current.is_empty() { + false + } else { + current.iter().all(|r| r.intersection(new).is_none()) + } + }); + assert!( + found_empty_intersection, + "dimensions: {:?}", + subarray + .dimension_ranges + .iter() + .zip(ranges.iter()) + .map(|(d, r)| format!( + "({:?} && {:?} = {:?}", + d, + r, + d.iter() + .map(|dr| dr.intersection(r)) + .collect::>>() + )) + .collect::>() + ); + } + } + + /// Validate the intersection of two subarrays. + /// `s1` and `s2` are two subarrays for the same schema. + fn do_subarray_intersect_subarray(s1: &SubarrayData, s2: &SubarrayData) { + if let Some(intersection) = s1.intersect(s2) { + for (di, ds1, ds2) in izip!( + intersection.dimension_ranges.iter(), + s1.dimension_ranges.iter(), + s2.dimension_ranges.iter(), + ) { + if ds1.is_empty() { + assert_eq!(di, ds2); + continue; + } else if ds2.is_empty() { + assert_eq!(di, ds1); + continue; + } + // there must be some pair from (rs1, rs2) where di is the intersection + for ri in di.iter() { + let found_input = ds1 + .iter() + .cartesian_product(ds2.iter()) + .any(|(rs1, rs2)| { + Some(ri) == rs1.intersection(rs2).as_ref() + }); + assert!(found_input, "ri = {:?}", ri); + } + + // and for all pairs (rs1, rs2), there must be some ri which covers + for (rs1, rs2) in ds1.iter().cartesian_product(ds2.iter()) { + let Some(intersection) = rs1.intersection(rs2) else { + continue; + }; + + let found_output = di.iter().any(|ri| intersection == *ri); + assert!( + found_output, + "rs1 = {:?}, rs2 = {:?}, intersection = {:?}", + rs1, rs2, intersection + ); + } + } + } else { + // for at least one dimension, none of the ranges of `s1` + // intersected with any range from `s2` + let found_empty_intersection = s1 + .dimension_ranges + .iter() + .zip(s2.dimension_ranges.iter()) + .any(|(ds1, ds2)| { + ds1.iter() + .cartesian_product(ds2.iter()) + .all(|(rs1, rs2)| rs1.intersection(rs2).is_none()) + }); + assert!(found_empty_intersection); } 
+ } + + fn strat_subarray_intersect_ranges( + ) -> impl Strategy)> { + let req = Rc::new(SchemaRequirements { + domain: Some(Rc::new(DomainRequirements { + num_dimensions: 1..=1, + ..Default::default() + })), + ..Default::default() + }); + + any_with::(req).prop_flat_map(|schema| { + let schema = Rc::new(schema); + ( + any_with::(Some(Rc::clone(&schema))), + schema.domain.subarray_strategy(), + ) + }) + } + + fn strat_subarray_intersect_subarray( + ) -> impl Strategy { + use crate::array::domain::strategy::Requirements as DomainRequirements; + use crate::array::schema::strategy::Requirements as SchemaRequirements; + + let req = Rc::new(SchemaRequirements { + domain: Some(Rc::new(DomainRequirements { + num_dimensions: 1..=1, + ..Default::default() + })), + ..Default::default() + }); + + any_with::(req).prop_flat_map(|schema| { + let schema = Rc::new(schema); + let strat_subarray = + any_with::(Some(Rc::clone(&schema))); + (strat_subarray.clone(), strat_subarray.clone()) + }) + } + + proptest! 
{ + #[test] + fn subarray_intersect_ranges((subarray, range) in strat_subarray_intersect_ranges()) { + do_subarray_intersect_ranges(&subarray, &range) + } + + #[test] + fn subarray_intersect_subarray((s1, s2) in strat_subarray_intersect_subarray()) { + do_subarray_intersect_subarray(&s1, &s2) + } + } +} diff --git a/tiledb/queries/Cargo.toml b/tiledb/queries/Cargo.toml index 3cd84348..b7e1b2a4 100644 --- a/tiledb/queries/Cargo.toml +++ b/tiledb/queries/Cargo.toml @@ -5,7 +5,8 @@ rust-version.workspace = true version.workspace = true [dependencies] -tiledb = { workspace = true } +tiledb-api = { workspace = true } +tiledb-common = { workspace = true } [build-dependencies] pkg-config = { workspace = true } diff --git a/tiledb/queries/examples/aggregate-adapters.rs b/tiledb/queries/examples/aggregate-adapters.rs index 6661f61d..b356da57 100644 --- a/tiledb/queries/examples/aggregate-adapters.rs +++ b/tiledb/queries/examples/aggregate-adapters.rs @@ -1,12 +1,20 @@ -extern crate tiledb; +extern crate tiledb_api; +extern crate tiledb_common; extern crate tiledb_query_adapters; use std::path::PathBuf; -use tiledb::datatype::PhysicalValue; -use tiledb::query::read::AggregateFunction; -use tiledb::query::{QueryBuilder, ReadQuery}; -use tiledb::Datatype; -use tiledb::Result as TileDBResult; + +use tiledb_api::array::{ + Array, AttributeBuilder, Dimension, DimensionBuilder, DomainBuilder, + SchemaBuilder, +}; +use tiledb_api::query::read::AggregateFunction; +use tiledb_api::query::{ + QueryBuilder, QueryLayout, ReadBuilder, ReadQuery, WriteBuilder, +}; +use tiledb_api::{Context, Result as TileDBResult}; +use tiledb_common::array::{ArrayType, Mode}; +use tiledb_common::datatype::{Datatype, PhysicalValue}; use tiledb_query_adapters::AggregateQueryBuilderExt; const AGGREGATE_ARRAY_URI: &str = "aggregates"; @@ -44,12 +52,12 @@ fn main() { /// Returns whether the example array already exists fn array_exists() -> bool { - let tdb = match tiledb::context::Context::new() { + let tdb = 
match Context::new() { Err(_) => return false, Ok(tdb) => tdb, }; - tiledb::array::Array::exists(&tdb, AGGREGATE_ARRAY_URI) + Array::exists(&tdb, AGGREGATE_ARRAY_URI) .expect("Error checking array existence") } @@ -60,48 +68,35 @@ fn array_exists() -> bool { /// span all 4 elements on each dimension. /// Hence we have 16 cells of data and a single tile for the whole array. fn create_array() -> TileDBResult<()> { - let tdb = tiledb::context::Context::new()?; + let tdb = Context::new()?; let domain = { - let rows: tiledb::array::Dimension = - tiledb::array::DimensionBuilder::new( - &tdb, - "rows", - Datatype::Int32, - ([1, 4], 4), - )? - .build(); - let cols: tiledb::array::Dimension = - tiledb::array::DimensionBuilder::new( - &tdb, - "columns", - Datatype::Int32, - ([1, 4], 4), - )? - .build(); + let rows: Dimension = + DimensionBuilder::new(&tdb, "rows", Datatype::Int32, ([1, 4], 4))? + .build(); + let cols: Dimension = DimensionBuilder::new( + &tdb, + "columns", + Datatype::Int32, + ([1, 4], 4), + )? + .build(); - tiledb::array::DomainBuilder::new(&tdb)? + DomainBuilder::new(&tdb)? .add_dimension(rows)? .add_dimension(cols)? .build() }; - let attribute_a = tiledb::array::AttributeBuilder::new( - &tdb, - AGGREGATE_ATTRIBUTE_NAME, - tiledb::Datatype::Int32, - )? - .build(); - - let schema = tiledb::array::SchemaBuilder::new( - &tdb, - tiledb::array::ArrayType::Dense, - domain, - )? - .add_attribute(attribute_a)? - .build()?; - - tiledb::Array::create(&tdb, AGGREGATE_ARRAY_URI, schema) + let attribute_a = + AttributeBuilder::new(&tdb, AGGREGATE_ATTRIBUTE_NAME, Datatype::Int32)? + .build(); + + let schema = SchemaBuilder::new(&tdb, ArrayType::Dense, domain)? + .add_attribute(attribute_a)? + .build()?; + + Array::create(&tdb, AGGREGATE_ARRAY_URI, schema) } /// Writes data into the array in row-major order from a 1D-array buffer. 
@@ -111,18 +106,14 @@ fn create_array() -> TileDBResult<()> { /// [ 9, 10, 11, 12], /// [13, 14, 15, 16]] fn write_array() -> TileDBResult<()> { - let tdb = tiledb::context::Context::new()?; + let tdb = Context::new()?; - let array = tiledb::Array::open( - &tdb, - AGGREGATE_ARRAY_URI, - tiledb::array::Mode::Write, - )?; + let array = Array::open(&tdb, AGGREGATE_ARRAY_URI, Mode::Write)?; let data = vec![1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]; - let query = tiledb::query::WriteBuilder::new(array)? - .layout(tiledb::query::QueryLayout::RowMajor)? + let query = WriteBuilder::new(array)? + .layout(QueryLayout::RowMajor)? .data_typed(AGGREGATE_ATTRIBUTE_NAME, &data)? .build(); @@ -138,16 +129,12 @@ fn write_array() -> TileDBResult<()> { /// [ _, _, _, _]]] /// This should print 6, which is the number of elements in the slice. fn example_count() -> TileDBResult<()> { - let tdb = tiledb::context::Context::new()?; + let tdb = Context::new()?; - let array = tiledb::Array::open( - &tdb, - AGGREGATE_ARRAY_URI, - tiledb::array::Mode::Read, - )?; + let array = Array::open(&tdb, AGGREGATE_ARRAY_URI, Mode::Read)?; - let mut query = tiledb::query::ReadBuilder::new(array)? - .layout(tiledb::query::QueryLayout::RowMajor)? + let mut query = ReadBuilder::new(array)? + .layout(QueryLayout::RowMajor)? .aggregate_physical_value(AggregateFunction::Count)? .start_subarray()? .add_range("rows", &[1i32, 2])? @@ -174,16 +161,12 @@ fn example_count() -> TileDBResult<()> { /// [ _, _, _, _]]] /// This should print 36, which is the sum of elements in the slice. fn example_sum() -> TileDBResult<()> { - let tdb = tiledb::context::Context::new()?; + let tdb = Context::new()?; - let array = tiledb::Array::open( - &tdb, - AGGREGATE_ARRAY_URI, - tiledb::array::Mode::Read, - )?; + let array = Array::open(&tdb, AGGREGATE_ARRAY_URI, Mode::Read)?; - let mut query = tiledb::query::ReadBuilder::new(array)? - .layout(tiledb::query::QueryLayout::RowMajor)? 
+ let mut query = ReadBuilder::new(array)? + .layout(QueryLayout::RowMajor)? .aggregate_physical_value(AggregateFunction::Sum( AGGREGATE_ATTRIBUTE_NAME.to_owned(), ))? @@ -214,16 +197,12 @@ fn example_sum() -> TileDBResult<()> { /// This function also uses the AggregateResultHandle enum to pass the /// result back. fn example_min_max() -> TileDBResult<()> { - let tdb = tiledb::context::Context::new()?; + let tdb = Context::new()?; - let array = tiledb::Array::open( - &tdb, - AGGREGATE_ARRAY_URI, - tiledb::array::Mode::Read, - )?; + let array = Array::open(&tdb, AGGREGATE_ARRAY_URI, Mode::Read)?; - let mut query = tiledb::query::ReadBuilder::new(array)? - .layout(tiledb::query::QueryLayout::RowMajor)? + let mut query = ReadBuilder::new(array)? + .layout(QueryLayout::RowMajor)? .aggregate_physical_value(AggregateFunction::Max( AGGREGATE_ATTRIBUTE_NAME.to_owned(), ))? @@ -256,16 +235,12 @@ fn example_min_max() -> TileDBResult<()> { /// [ _, _, _, _]]] /// This should print 8, which is the mean of the slice. fn example_mean() -> TileDBResult<()> { - let tdb = tiledb::context::Context::new()?; + let tdb = Context::new()?; - let array = tiledb::Array::open( - &tdb, - AGGREGATE_ARRAY_URI, - tiledb::array::Mode::Read, - )?; + let array = Array::open(&tdb, AGGREGATE_ARRAY_URI, Mode::Read)?; - let mut query = tiledb::query::ReadBuilder::new(array)? - .layout(tiledb::query::QueryLayout::RowMajor)? + let mut query = ReadBuilder::new(array)? + .layout(QueryLayout::RowMajor)? .aggregate_physical_value(AggregateFunction::Mean( AGGREGATE_ATTRIBUTE_NAME.to_owned(), ))? 
diff --git a/tiledb/queries/src/aggregate.rs b/tiledb/queries/src/aggregate.rs index dbb1549d..8138ca0d 100644 --- a/tiledb/queries/src/aggregate.rs +++ b/tiledb/queries/src/aggregate.rs @@ -1,9 +1,11 @@ -use tiledb::datatype::PhysicalValue; -use tiledb::physical_type_go; -use tiledb::query::read::aggregate::*; -use tiledb::query::read::ReadStepOutput; -use tiledb::query::{BuilderBase, Query, QueryBase, QueryBuilder, ReadQuery}; -use tiledb::{Array, Result as TileDBResult}; +use tiledb_api::query::read::aggregate::*; +use tiledb_api::query::read::ReadStepOutput; +use tiledb_api::query::{ + BuilderBase, Query, QueryBase, QueryBuilder, ReadQuery, +}; +use tiledb_api::{Array, Result as TileDBResult}; +use tiledb_common::datatype::PhysicalValue; +use tiledb_common::physical_type_go; /// An `AggregateQueryBuilder` blanket implementation that provides extra adapters /// and methods for running aggregate queries. diff --git a/tiledb/queries/src/lib.rs b/tiledb/queries/src/lib.rs index e1bbe2b7..e8f4cfd0 100644 --- a/tiledb/queries/src/lib.rs +++ b/tiledb/queries/src/lib.rs @@ -3,8 +3,6 @@ //! Import traits from this crate to extend the //! various [`tiledb`] query building traits. -extern crate tiledb; - mod aggregate; pub use self::aggregate::*; diff --git a/tiledb/sys/ignored.rs b/tiledb/sys/ignored.rs index a223be33..19d7eefd 100644 --- a/tiledb/sys/ignored.rs +++ b/tiledb/sys/ignored.rs @@ -12,11 +12,27 @@ pub const TILEDB_VERSION_PATCH: u32 = 0; // This is a list of functions that we are currently planning on not wrapping. extern "C" { - - // The dump functions aren't being wrapped because Rust makes it really easy + // With respect to the crate structure: we want entities which only need + // [tiledb_common] to not need to link to core, and [Datatype] must + // live in [tiledb_common]. 
+ // + // [tiledb_datatype_size] is essentially redundant with matching a Datatype + // to its physical type + pub fn tiledb_datatype_size(type_: tiledb_datatype_t) -> u64; + + // The dump/to_str/from_str functions aren't being wrapped because Rust makes it really easy // to write Debug traits that will dump everything as a JSON string. The dump // functions just write free form ASCII to a file handle which isn't nearly // as useful. + pub fn tiledb_datatype_to_str( + datatype: tiledb_datatype_t, + str_: *mut *const ::std::os::raw::c_char, + ) -> capi_return_t; + + pub fn tiledb_datatype_from_str( + str_: *const ::std::os::raw::c_char, + datatype: *mut tiledb_datatype_t, + ) -> capi_return_t; pub fn tiledb_attribute_dump( ctx: *mut tiledb_ctx_t, diff --git a/tiledb/sys/src/datatype.rs b/tiledb/sys/src/datatype.rs deleted file mode 100644 index 7ac24480..00000000 --- a/tiledb/sys/src/datatype.rs +++ /dev/null @@ -1,16 +0,0 @@ -use crate::capi_enum::tiledb_datatype_t; -use crate::types::capi_return_t; - -extern "C" { - pub fn tiledb_datatype_to_str( - datatype: tiledb_datatype_t, - str_: *mut *const ::std::os::raw::c_char, - ) -> capi_return_t; - - pub fn tiledb_datatype_from_str( - str_: *const ::std::os::raw::c_char, - datatype: *mut tiledb_datatype_t, - ) -> capi_return_t; - - pub fn tiledb_datatype_size(type_: tiledb_datatype_t) -> u64; -} diff --git a/tiledb/sys/src/lib.rs b/tiledb/sys/src/lib.rs index 35851dca..63a64e55 100644 --- a/tiledb/sys/src/lib.rs +++ b/tiledb/sys/src/lib.rs @@ -10,7 +10,6 @@ mod capi_enum; mod config; mod constants; mod context; -mod datatype; mod dimension; mod domain; mod encryption; @@ -42,7 +41,6 @@ pub use capi_enum::*; pub use config::*; pub use constants::*; pub use context::*; -pub use datatype::*; pub use dimension::*; pub use domain::*; pub use encryption::*; diff --git a/tiledb/test-utils/Cargo.toml b/tiledb/test-utils/Cargo.toml deleted file mode 100644 index a9981300..00000000 --- a/tiledb/test-utils/Cargo.toml +++ 
/dev/null @@ -1,15 +0,0 @@ -[package] -name = "tiledb-test-utils" -edition.workspace = true -rust-version.workspace = true -version.workspace = true - -[dependencies] -anyhow = { workspace = true } -nix = { version = "0", features = ["signal"], optional = true } -proptest = { workspace = true } -tempfile = "3" - -[features] -default = ["signal"] -signal = ["dep:nix"] diff --git a/tiledb/test-utils/src/lib.rs b/tiledb/test-utils/src/lib.rs deleted file mode 100644 index d355ba98..00000000 --- a/tiledb/test-utils/src/lib.rs +++ /dev/null @@ -1,11 +0,0 @@ -extern crate proptest; - -#[cfg(feature = "signal")] -pub mod signal; -pub mod strategy; -pub mod uri_generators; - -#[cfg(feature = "signal")] -pub use signal::*; - -pub use uri_generators::{get_uri_generator, TestArrayUri}; diff --git a/tiledb/utils/src/option.rs b/tiledb/utils/src/option.rs index 27fc9d9d..53038184 100644 --- a/tiledb/utils/src/option.rs +++ b/tiledb/utils/src/option.rs @@ -37,7 +37,8 @@ macro_rules! option_subset_partialeq { #[macro_export] macro_rules! assert_option_subset { - ($left:expr, $right:expr $(,)?) => { + ($left:expr, $right:expr $(,)?) => {{ + use $crate::option::OptionSubset; match (&$left, &$right) { (left_val, right_val) => { if !(left_val.option_subset(right_val)) { @@ -49,12 +50,13 @@ right: {right_val:?}"# } } } - }; + }}; } #[macro_export] macro_rules! assert_not_option_subset { - ($left:expr, $right:expr $(,)?) => { + ($left:expr, $right:expr $(,)?) 
=> {{ + use $crate::option::OptionSubset; match (&$left, &$right) { (left_val, right_val) => { if left_val.option_subset(right_val) { @@ -66,7 +68,7 @@ right: {right_val:?}"# } } } - }; + }}; } impl OptionSubset for Option @@ -682,7 +684,6 @@ mod tests { #[cfg(feature = "serde_json")] mod serde_json { - use super::*; use crate::serde_json::json; use crate::serde_json::value::{Map, Value}; diff --git a/tools/api-coverage/src/main.rs b/tools/api-coverage/src/main.rs index 760a4864..89115ca6 100644 --- a/tools/api-coverage/src/main.rs +++ b/tools/api-coverage/src/main.rs @@ -18,7 +18,7 @@ use sys::SysDefs; #[command(version, about, long_about = None)] struct Args { /// The name of the API crate - #[arg(short, long, default_value_t = String::from("tiledb"))] + #[arg(short, long, default_value_t = String::from("tiledb-api"))] api: String, /// Path to the bindgen generated APIs