Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[r] Move parse_query_condition_new -> parse_query_condition #3174

Merged
merged 5 commits into from
Oct 16, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion apis/r/DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ Description: Interface for working with 'TileDB'-based Stack of Matrices,
like those commonly used for single cell data analysis. It is documented at
<https://github.com/single-cell-data>; a formal specification available is at
<https://github.com/single-cell-data/SOMA/blob/main/abstract_specification.md>.
Version: 1.15.99.4
Version: 1.15.99.5
Authors@R: c(
person(given = "Aaron", family = "Wolen",
role = c("cre", "aut"), email = "aaron@tiledb.com",
Expand Down
3 changes: 2 additions & 1 deletion apis/r/NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,8 @@
* Handle `numeric` coords properly when reading arrays
* Remove two more `tiledb::schema` callsites [#3160](https://github.com/single-cell-data/TileDB-SOMA/pull/3160)
* Add new Arrow-to-R type mapper
* Add transitiona/non-exported `parse_query_condition_new` [#3162](https://github.com/single-cell-data/TileDB-SOMA/pull/3162)
* Add transitional/non-exported `parse_query_condition_new` [#3162](https://github.com/single-cell-data/TileDB-SOMA/pull/3162)
* Apply new `parse_query_condition` [#3174](https://github.com/single-cell-data/TileDB-SOMA/pull/3174)

# tiledbsoma 1.14.1

Expand Down
76 changes: 66 additions & 10 deletions apis/r/R/QueryCondition.R
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@
#' Expressions, in the R language syntax, are parsed locally by this function.
#'
#' @param expr An expression that is understood by the TileDB grammar for
#' query conditions.
#' query conditions, as a character string.
#'
#' @param schema The Arrow schema for the array for which a query
#' condition is being prepared. This is necessary to obtain type information
Expand All @@ -57,7 +57,11 @@ parse_query_condition_new <- function(
somactx
) {

spdl::debug("[parseqc] ENTER [{}]", expr)

stopifnot(
"The expr argument must be a single character string" =
is(expr, "character") && length(expr) == 1,
"The schema argument must be an Arrow Schema" =
is(schema, "ArrowObject") &&
is(schema, "Schema"),
Expand Down Expand Up @@ -164,17 +168,66 @@ parse_query_condition_new <- function(
arrow_type_name <- "utf8"
}

if (arrow_type_name == "timestamp") {
unit <- arrow_field$type$unit()
if (unit == 0) {
arrow_type_name <- "timestamp_s"
} else if (unit == 1) {
arrow_type_name <- "timestamp_ms"
} else if (unit == 2) {
arrow_type_name <- "timestamp_us"
} else if (unit == 3) {
arrow_type_name <- "timestamp_ns"
} else {
.error_function(
"Attribute '", attr_name, "' has unknown unit ",
arrow_field$type$unit, call. = FALSE)
}
}

value = switch(
arrow_type_name,
ascii = rhs_text,
string = rhs_text,
utf8 = rhs_text,
large_utf8 = rhs_text,
bool = as.logical(rhs_text),
# Problem:

# > t <-as.POSIXct('1970-01-01 01:00:05 UTC')
# > as.numeric(t)
# [1] 21605
# > ?as.POSIXct
# > t <-as.POSIXct('1970-01-01 01:00:05 EST')
# > as.numeric(t)
# [1] 21605
# > t <-as.POSIXct('1970-01-01 01:00:05 UTC', tz="EST")
# > as.numeric(t)
# [1] 21605
# > t <-as.POSIXct('1970-01-01 01:00:05 UTC', tz="UTC")
# > as.numeric(t)
# [1] 3605

# It's not respecting the timezone given in the first argument string.
# Not good.

timestamp_s = as.numeric(as.POSIXct(rhs_text, tz="UTC")), # THIS NEEDS THOUGHT
timestamp_ms = as.numeric(as.POSIXct(rhs_text, tz="UTC")), # THIS NEEDS THOUGHT
timestamp_ns = as.numeric(as.POSIXct(rhs_text, tz="UTC")), # THIS NEEDS THOUGHT
timestamp_us = as.numeric(as.POSIXct(rhs_text, tz="UTC")), # THIS NEEDS THOUGHT
date32 = as.Date(rhs_text),
as.numeric(rhs_text))

spdl::debug("[parseqc] triple name:[{}] value:[{}] type:[{}] op:[{}]",
attr_name,
value,
arrow_type_name,
op_name);

# General case of extracting appropriate value given type info
return(tiledbsoma_query_condition_from_triple(
attr_name = attr_name,
value = switch(
arrow_type_name,
ascii = rhs_text,
utf8 = rhs_text,
bool = as.logical(rhs_text),
date32 = as.POSIXct(rhs_text),
timestamp = as.Date(rhs_text),
as.numeric(rhs_text)),
value = value,
arrow_type_name = arrow_type_name,
op_name = .map_op_to_character(op_name),
qc = tiledbsoma_empty_query_condition(somactx)))
Expand All @@ -184,8 +237,11 @@ parse_query_condition_new <- function(
}
}

# Convert expr from string to language
aslang <- str2lang(expr)

# Use base-r `substitute` to map the user-provided expression to a parse tree
parse_tree <- substitute(expr)
parse_tree <- substitute(aslang)

# Map the parse tree to TileDB core QueryCondition
return(.parse_tree_to_qc(parse_tree, debug))
Expand Down
7 changes: 3 additions & 4 deletions apis/r/R/SOMADataFrame.R
Original file line number Diff line number Diff line change
Expand Up @@ -178,8 +178,6 @@ SOMADataFrame <- R6::R6Class(
private$check_open_for_read()

result_order <- match_query_layout(result_order)
uri <- self$uri
arr <- self$object # need array (schema) to properly parse query condition

## if unnamed set names
if (!is.null(coords)) {
Expand All @@ -199,8 +197,9 @@ SOMADataFrame <- R6::R6Class(

if (!is.null(value_filter)) {
value_filter <- validate_read_value_filter(value_filter)
parsed <- do.call(what = tiledb::parse_query_condition,
args = list(expr = str2lang(value_filter), ta = arr))
parsed <- do.call(
what = parse_query_condition,
args = list(expr = value_filter, schema = self$schema(), somactx = private$.soma_context))
value_filter <- parsed@ptr
}
spdl::debug("[SOMADataFrame$read] calling sr_setup for {} at ({},{})", self$uri,
Expand Down
27 changes: 24 additions & 3 deletions apis/r/src/query_condition.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -130,7 +130,9 @@ void libtiledbsoma_query_condition_from_triple(
uint64_t cond_val_size = sizeof(float);
query_cond->init(attr_name, (void*)&v, cond_val_size, op);

} else if (arrow_type_name == "utf8" || arrow_type_name == "large_utf8") {
} else if (
arrow_type_name == "string" || arrow_type_name == "ascii" ||
arrow_type_name == "utf8" || arrow_type_name == "large_utf8") {
std::string v = Rcpp::as<std::string>(condition_value);
query_cond->init(attr_name, v, op);

Expand All @@ -139,13 +141,32 @@ void libtiledbsoma_query_condition_from_triple(
uint64_t cond_val_size = sizeof(bool);
query_cond->init(attr_name, (void*)&v, cond_val_size, op);

} else if (arrow_type_name == "timestamp") {
// Arrow timestamp TileDB DATETIME_MS
} else if (arrow_type_name == "timestamp_s") {
int64_t v = static_cast<int64_t>(
Rcpp::as<double>(condition_value));
spdl::debug("ts3 {}", v);
uint64_t cond_val_size = sizeof(int64_t);
query_cond->init(attr_name, (void*)&v, cond_val_size, op);

} else if (arrow_type_name == "timestamp_ms") {
int64_t v = static_cast<int64_t>(
Rcpp::as<double>(condition_value) * 1000);
uint64_t cond_val_size = sizeof(int64_t);
query_cond->init(attr_name, (void*)&v, cond_val_size, op);

} else if (arrow_type_name == "timestamp_us") {
int64_t v = static_cast<int64_t>(
Rcpp::as<double>(condition_value) * 1e6);
uint64_t cond_val_size = sizeof(int64_t);
query_cond->init(attr_name, (void*)&v, cond_val_size, op);

} else if (arrow_type_name == "timestamp_ns") {
// XXX nanotime ...
int64_t v = static_cast<int64_t>(
Rcpp::as<double>(condition_value) * 1e9);
uint64_t cond_val_size = sizeof(int64_t);
query_cond->init(attr_name, (void*)&v, cond_val_size, op);

} else if (arrow_type_name == "date32") {
// Arrow date32 TileDB DATETIME_DAY
int64_t v = static_cast<int64_t>(Rcpp::as<double>(condition_value));
Expand Down
2 changes: 2 additions & 0 deletions apis/r/tests/testthat/test-SCEOutgest.R
Original file line number Diff line number Diff line change
Expand Up @@ -335,6 +335,7 @@ test_that("Load SCE object from indexed ExperimentQuery", {
skip_if(!extended_tests() || covr_tests())
skip_if_not_installed('SingleCellExperiment', .MINIMUM_SCE_VERSION('c'))
uri <- tempfile(pattern="sce-experiment-query-value-filters")

n_obs <- 1001L
n_var <- 99L
n_pcs <- 50L
Expand Down Expand Up @@ -371,6 +372,7 @@ test_that("Load SCE object from indexed ExperimentQuery", {
n_var_select <- length(var_label_values)
n_obs_select <- length(obs_label_values)
expect_no_condition(obj <- query$to_single_cell_experiment())

expect_s4_class(obj, 'SingleCellExperiment')
expect_identical(dim(obj), c(n_var_select, n_obs_select))
expect_identical(
Expand Down
23 changes: 7 additions & 16 deletions apis/r/tests/testthat/test-SOMAArrayReader-Arrow.R
Original file line number Diff line number Diff line change
Expand Up @@ -15,33 +15,24 @@ test_that("Arrow Interface from SOMAArrayReader", {
tb1 <- soma_array_to_arrow_table(soma_array_reader(uri, columns))
expect_equal(tb1$num_rows, 2638)

arr <- tiledb_array(uri) # need array for schema access to qc parser
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

already tested in tests/testthat/test-query-condition.R

qc <- parse_query_condition(n_counts < 1000 && n_genes >= 400, ta = arr)
tb2 <- soma_array_to_arrow_table(soma_array_reader(uri, columns, qc@ptr))

expect_equal(tb2$num_rows, 47)
expect_true(all(tb2$n_counts < 1000))
expect_true(all(tb2$n_genes >= 400))


# read everything
tb3 <- soma_array_to_arrow_table(soma_array_reader(uri))
tb2 <- soma_array_to_arrow_table(soma_array_reader(uri))

expect_equal(tb3$num_rows, 2638)
expect_equal(tb3$num_columns, 6)
expect_equal(tb2$num_rows, 2638)
expect_equal(tb2$num_columns, 6)

# read a subset of rows and columns
tb4 <- soma_array_to_arrow_table(soma_array_reader(uri = uri,
tb3 <- soma_array_to_arrow_table(soma_array_reader(uri = uri,
colnames = c("obs_id", "percent_mito", "n_counts", "louvain"),
dim_ranges = list(soma_joinid = rbind(bit64::as.integer64(c(1000, 1004)),
bit64::as.integer64(c(2000, 2004)))),
dim_points=list(soma_joinid = bit64::as.integer64(seq(0, 100, by = 20)))))


expect_equal(tb4$num_rows, 16)
expect_equal(tb4$num_columns, 4)
expect_equal(tb3$num_rows, 16)
expect_equal(tb3$num_columns, 4)

rm(z, tb, rb, tb1, arr, tb2, tb3, tb4)
rm(z, tb, rb, tb1, tb2, tb3)
gc()
})

Expand Down
Loading
Loading