Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implement data.table-like j argument, without evalutation #37

Merged
merged 3 commits into from
May 7, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .travis.yml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
language: R
cache: packages
sudo: false
warnings_are_errors: true
warnings_are_errors: false

r:
- oldrel
Expand Down
1 change: 1 addition & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ export(table_proxy_read_full)
export(table_proxy_read_range)
export(table_proxy_select_row_mask)
export(table_proxy_select_rows)
export(table_proxy_transform)
import(data.table)
import(fst)
importFrom(utils,capture.output)
Expand Down
12 changes: 6 additions & 6 deletions R/data_table_interface.R
Original file line number Diff line number Diff line change
Expand Up @@ -158,17 +158,17 @@ data_table_interface <- function(table_proxy) {

# select rows with an integer row index
tbl_proxy <- table_proxy_select_rows(tbl_proxy, i)

# return a copy of the interface with new table proxy
return(data_table_interface(tbl_proxy))
}

if (!missing(j)) {
if (verbose) print("j used")

jsub <- parse_j(substitute(j))
colexps <- parse_j(j, tbl_proxy$remotetablestate$colnames, parent.frame())

# return full table, implement later
return(data_table_interface(tbl_proxy))
# update the column expressions and names
tbl_proxy <- table_proxy_transform(tbl_proxy, colexps)
}

# return a copy of the interface with new table proxy
return(data_table_interface(tbl_proxy))
}
4 changes: 4 additions & 0 deletions R/data_table_interface_generics.R
Original file line number Diff line number Diff line change
Expand Up @@ -149,6 +149,9 @@ print.datatableinterface <- function(x, number_of_rows = 50, ...) {
}
}

# set column names
setnames(sample_data, col_names)

# use color in terminal output
color_on <- TRUE

Expand Down Expand Up @@ -198,6 +201,7 @@ print.datatableinterface <- function(x, number_of_rows = 50, ...) {
gray_rows <- c(type_rows, gray_rows)
} else {
# table is not splitted along the row axis

sample_data_print <- rbind(
type_row,
sample_data_print)
Expand Down
25 changes: 20 additions & 5 deletions R/data_table_interface_j.R
Original file line number Diff line number Diff line change
Expand Up @@ -66,10 +66,25 @@ call_parser <- function(jsub, parent_frame, table_columns, indentation = 0) {
}


parse_j <- function(j, table_columns) {
jsub <- substitute(j)
if (is.call(jsub)) {
result <- call_parser(jsub, parent.frame(), table_columns)
return(result)
parse_j <- function(j, table_columns, parent_frame) {

jsub <- substitute(j, parent_frame)
jsub <- replace_dot_alias(jsub)

if (is.name(jsub) || !(jsub[[1]] == "list")) {
stop("j must be a list")
}

colexps <- as.list(jsub[-1])

if (is.null(names(colexps))) {
names(colexps) <- rep("", length(colexps))
}

no_name <- names(colexps) == ""
expr_is_name <- sapply(colexps, is.name)
names(colexps)[no_name & expr_is_name] <- colexps[no_name & expr_is_name]
names(colexps)[no_name & !expr_is_name] <- paste0("V", which(no_name & !expr_is_name))

return(colexps)
}
5 changes: 3 additions & 2 deletions R/table_proxy.R
Original file line number Diff line number Diff line change
Expand Up @@ -48,8 +48,9 @@ table_proxy <- function(remote_table) {
coltypes = rtable_column_types(remote_table),
nrow = rtable_nrow(remote_table),
ncol = length(proxy_colnames),
slice_map = NULL, # all rows initialy selected
slice_map_ordered = TRUE
slice_map = NULL, # all rows initially selected
slice_map_ordered = TRUE,
colexps = sapply(proxy_colnames, as.symbol)
)

.table_proxy(remote_table, remote_table_state)
Expand Down
155 changes: 108 additions & 47 deletions R/table_proxy_methods.R
Original file line number Diff line number Diff line change
Expand Up @@ -63,37 +63,41 @@ table_proxy_read_range <- function(tbl_proxy, from_row, to_row, col_names = NULL
rtable_state <- tbl_proxy$remotetablestate

# determine columns to read
cols <- rtable_state$colnames

if (!is.null(col_names)) {
if (sum(!(col_names %in% cols) != 0)) {
stop("Unknown columns requested")
}

cols <- col_names
}
on_disk <- sapply(rtable_state$colexps, function(expr) {
is.name(expr) && (as.character(expr) %in% rtable_colnames(rtable))
})
cols_to_read <- as.character(sapply(rtable_state$colexps[on_disk], deparse))

# determine rows to read
slice_map <- rtable_state$slice_map

# no previous slice map, use arguments for row subset
if (is.null(slice_map)) {
return(rtable_read_range(rtable, from_row, to_row, cols))
}
# no previous slice map, use arguments for row subset
result <- rtable_read_range(rtable, from_row, to_row, cols_to_read)
} else {
# calculate new slice map
slice_map <- slice_map[from_row:to_row]

# calculate new slice map
slice_map <- slice_map[from_row:to_row]
# order slice map and read row subset
min_row <- min(slice_map)
max_row <- max(slice_map)

# order slice map and read row subset
min_row <- min(slice_map)
max_row <- max(slice_map)
# very inefficient partial read
# TODO: create a read mask
row_range <- rtable_read_range(rtable, min_row, max_row, cols)

# very inefficient partial read
# TODO: create a read mask
row_range <- rtable_read_range(rtable, min_row, max_row, cols)
# read contiguous extent of selected rows, then filter & order
row_range[1 + slice_map - min_row, , drop = FALSE]

# read contiguous extent of selected rows, then filter & order
row_range[1 + slice_map - min_row, , drop = FALSE]
}

if (ncol(result) > 0) {
setnames(result, rtable_state$colnames[on_disk])
}

result <- .add_virtual_cols(result, rtable_state, on_disk, to_row - from_row + 1)

return(result)
}


Expand All @@ -113,45 +117,45 @@ table_proxy_read_full <- function(tbl_proxy, col_names = NULL) {
rtable_state <- tbl_proxy$remotetablestate

# determine columns to read
cols <- rtable_state$colnames

if (!is.null(col_names)) {
if (sum(!(col_names %in% cols) != 0)) {
stop("Unknown columns requested")
}

cols <- col_names
}
on_disk <- sapply(rtable_state$colexps, function(expr) {
is.name(expr) && (as.character(expr) %in% rtable_colnames(rtable))
})
cols_to_read <- as.character(sapply(rtable_state$colexps[on_disk], deparse))

# determine rows to read
slice_map <- rtable_state$slice_map

# read all rows
if (is.null(slice_map)) {
return(rtable_read_full(rtable, cols))
}
# read all rows
result <- rtable_read_full(rtable, cols_to_read)
} else if (length(slice_map) == 0) {
# empty table
result <- rtable_read_range(rtable, 1, 2, cols_to_read)
} else {
# order slice map and read row subset
min_row <- min(slice_map)
max_row <- max(slice_map)

# very inefficient partial read
# TODO: create a read mask
result <- rtable_read_range(rtable, min_row, max_row, cols_to_read)

# empty table
if (length(slice_map) == 0) {
first_row <- rtable_read_range(rtable, 1, 2, cols)
return(first_row[c(FALSE, FALSE)])
# read contiguous extent of selected rows, then filter & order
result <- result[1 + slice_map - min_row, , drop = FALSE]
}

# order slice map and read row subset
min_row <- min(slice_map)
max_row <- max(slice_map)
if (ncol(result) > 0) {
setnames(result, rtable_state$colnames[on_disk])
}

# very inefficient partial read
# TODO: create a read mask
row_range <- rtable_read_range(rtable, min_row, max_row, cols)
result <- .add_virtual_cols(result, rtable_state, on_disk, rtable_state$nrow)

# read contiguous extent of selected rows, then filter & order
row_range[1 + slice_map - min_row, , drop = FALSE]
return(result)
}


#' Apply a binary row-selection operation on the current table_proxy state.
#' This operation will not change the slice map ordering
#' This operation will not change the slice map ordering

#'
#' @param tbl_proxy a table proxy object
Expand Down Expand Up @@ -235,3 +239,60 @@ table_proxy_select_rows <- function(tbl_proxy, i) {

tbl_proxy
}


#' Apply a column transformation operation on the current table_proxy state
#'
#' Currently this is intended to be used with the j argument to a datatableinterface
#' object, but it could just as well be used to implement mutate or transmute for a
#' dplyr interface.
#'
#' @param tbl_proxy a table proxy object
#' @param colexps column expressions
#'
#' @return a table proxy object with the new state
#' @export
table_proxy_transform <- function(tbl_proxy, colexps) {

# update nrow
tbl_proxy$remotetablestate$ncol <- length(colexps)
tbl_proxy$remotetablestate$colnames <- names(colexps)
tbl_proxy$remotetablestate$colexps <-
lapply(colexps, .resubstitute, sub = tbl_proxy$remotetablestate$colexps)

# probably best to infer the column type from the expression result as type
# casting can occur when a more complex expression is used
tbl_proxy$remotetablestate$coltypes <-
rtable_column_types(tbl_proxy$remotetable)[match(tbl_proxy$remotetablestate$colexps,
rtable_colnames(tbl_proxy$remotetable))]

tbl_proxy
}


# Like substitute, except it can substitute one expression into an expression stored in a variable
.resubstitute <- function(expr, sub) {
eval(substitute(substitute(.expr, sub), list(.expr = expr)))
}


# This is just a temporary hack for printing the unevaluated virtual column expressions:
# Perhaps this is the point where the actual evaluating code would be called.
.add_virtual_cols <- function(tbl, rtable_state, on_disk, n_row) {

if (all(on_disk)) {
return(tbl)
}

virt_cols <- as.data.frame(lapply(rtable_state$colexps[which(!on_disk)], function(expr) {
rep(deparse(expr), n_row)
}))

if (any(on_disk)) {
tbl <- cbind(tbl, virt_cols)[, rtable_state$colnames, drop = FALSE]
} else {
tbl <- virt_cols[, rtable_state$colnames, drop = FALSE]
}

return(tbl)
}
21 changes: 21 additions & 0 deletions man/table_proxy_transform.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

26 changes: 19 additions & 7 deletions tests/testthat/test_interface.R
Original file line number Diff line number Diff line change
Expand Up @@ -17,15 +17,11 @@ fst::write_fst(x, "1.fst")
ft <- fsttable::fst_table("1.fst")


# collect the ft and compares the result to the in-memory data.table
check_interface <- function(ft, dt, i, j) {
expect_equal(ft[i, j, collect = TRUE], dt[i, j])
}

expect_identical_table_proxy <- function(ft1, ft2) {
expect_identical(fsttable:::.get_table_proxy(ft1), fsttable:::.get_table_proxy(ft2))
}


test_that("empty i and j", {
ft_copy <- ft[]
expect_identical_table_proxy(ft, ft_copy)
Expand All @@ -35,10 +31,11 @@ test_that("empty i and j", {
test_that("row selection", {

# integer selection
check_interface(ft, x, 1:10)
expect_equal(ft[1:10, collect = TRUE], x[1:10, ])

# logical selection
check_interface(ft, x, rep(c(FALSE, TRUE, TRUE, FALSE), 25))
mask <- rep(c(FALSE, TRUE, TRUE, FALSE), 25)
expect_equal(ft[mask, collect = TRUE], x[mask, ])

# equivalent selection results in same structure
ft1 <- ft[5:1]
Expand All @@ -51,5 +48,20 @@ test_that("row selection", {
# full selection should return identical table
expect_identical_table_proxy(ft, ft[1:nrow(ft)])
expect_identical_table_proxy(ft, ft[rep(TRUE, nrow(ft))])
})


test_that("row selection", {

# dot single column selection
expect_equal(ft[, .(Z = X), collect = TRUE], x[, .(Z = X)])

# dot multiple identical column selection
expect_equal(ft[, .(Z = X, W = X), collect = TRUE], x[, .(Z = X, W = X)])

# dot multiple identical column selection
expect_equal(ft[, .(Z = X, W = X), collect = TRUE], x[, .(Z = X, W = X)])

# column name override
expect_equal(ft[, .(X = Y), collect = TRUE], x[, .(X = Y)])
})