From e14c3289f65809fbf5dcd4513954e0bbf92a1b2b Mon Sep 17 00:00:00 2001 From: cafreeman Date: Tue, 3 Mar 2015 16:04:55 -0600 Subject: [PATCH 1/5] `selectExpr` --- pkg/NAMESPACE | 1 + pkg/R/DataFrame.R | 29 +++++++++++++++ pkg/inst/tests/test_sparkSQL.R | 67 ++++++++++++++++++++-------------- 3 files changed, 69 insertions(+), 28 deletions(-) diff --git a/pkg/NAMESPACE b/pkg/NAMESPACE index 26d0690a3cac0..a2831f04c5a26 100644 --- a/pkg/NAMESPACE +++ b/pkg/NAMESPACE @@ -100,6 +100,7 @@ exportMethods("columns", "schema", "sortDF", "select", + "selectExpr", "toRDD", "where") diff --git a/pkg/R/DataFrame.R b/pkg/R/DataFrame.R index 8da89e54f00bf..8daae8d694cf4 100644 --- a/pkg/R/DataFrame.R +++ b/pkg/R/DataFrame.R @@ -607,6 +607,35 @@ setMethod("select", signature(x = "DataFrame", col = "Column"), dataFrame(sdf) }) + +#' SelectExpr +#' +#' Select from a DataFrame using a set of SQL expressions. +#' +#' @param x A DataFrame to be sorted. +#' @param expr A string containing a SQL expression +#' @param ... Additional expressions +#' @return A DataFrame +#' @rdname selectExpr +#' @export +#' @examples +#'\dontrun{ +#' sc <- sparkR.init() +#' sqlCtx <- sparkRSQL.init(sc) +#' path <- "path/to/file.json" +#' df <- jsonFile(sqlCtx, path) +#' selectExpr(df, "col1", "(col2 * 5) as newCol") +#' } +setGeneric("selectExpr", function(x, expr, ...) { standardGeneric("selectExpr") }) + +setMethod("selectExpr", + signature(x = "DataFrame", expr = "character"), + function(x, expr, ...) { + exprList <- list(expr, ...) + sdf <- callJMethod(x@sdf, "selectExpr", listToSeq(exprList)) + dataFrame(sdf) + }) + #' SortDF #' #' Sort a DataFrame by the specified column(s). diff --git a/pkg/inst/tests/test_sparkSQL.R b/pkg/inst/tests/test_sparkSQL.R index 26450eca63dd1..68ef8d0dc56a3 100644 --- a/pkg/inst/tests/test_sparkSQL.R +++ b/pkg/inst/tests/test_sparkSQL.R @@ -231,6 +231,17 @@ test_that("select with column", { expect_true(count(df2) == 3) }) +test_that("selectExpr() on a DataFrame", { + df <- jsonFile(sqlCtx, jsonPath) + selected <- selectExpr(df, "age * 2") + expect_true(names(selected) == "age * 2") + expect_equal(collect(selected), collect(select(df, df$age * 2L))) + + selected2 <- selectExpr(df, "name as newName", "abs(age) as age") + expect_equal(names(selected2), c("newName", "age")) + expect_true(count(selected2) == 3) +}) + test_that("column calculation", { df <- jsonFile(sqlCtx, jsonPath) d <- collect(select(df, alias(df$age + 1, "age2"))) @@ -282,34 +293,34 @@ test_that("filter() on a DataFrame", { }) test_that("join() on a DataFrame", { -df <- jsonFile(sqlCtx, jsonPath) - -mockLines2 <- c("{\"name\":\"Michael\", \"test\": \"yes\"}", - "{\"name\":\"Andy\", \"test\": \"no\"}", - "{\"name\":\"Justin\", \"test\": \"yes\"}", - "{\"name\":\"Bob\", \"test\": \"yes\"}") -jsonPath2 <- tempfile(pattern="sparkr-test", fileext=".tmp") -writeLines(mockLines2, jsonPath2) -df2 <- jsonFile(sqlCtx, jsonPath2) - -joined <- join(df, df2) -expect_equal(names(joined), c("age", "name", "name", "test")) -expect_true(count(joined) == 12) - -joined2 <- join(df, df2, df$name == df2$name) -expect_equal(names(joined2), c("age", "name", "name", "test")) -expect_true(count(joined2) == 3) - -joined3 <- join(df, df2, df$name == df2$name, "right_outer") -expect_equal(names(joined3), c("age", "name", "name", "test")) -expect_true(count(joined3) == 4) -expect_true(is.na(collect(joined3)$age[4])) - -joined4 <- select(join(df, df2, df$name == df2$name, "outer"), - alias(df$age + 5, "newAge"), df$name, df2$test) -expect_equal(names(joined4), c("newAge", "name", "test")) -expect_true(count(joined4) == 4) -expect_true(first(joined4)$newAge == 24) + df <- jsonFile(sqlCtx, jsonPath) + + mockLines2 <- c("{\"name\":\"Michael\", \"test\": \"yes\"}", + "{\"name\":\"Andy\", \"test\": \"no\"}", + "{\"name\":\"Justin\", \"test\": \"yes\"}", + "{\"name\":\"Bob\", \"test\": \"yes\"}") + jsonPath2 <- tempfile(pattern="sparkr-test", fileext=".tmp") + writeLines(mockLines2, jsonPath2) + df2 <- jsonFile(sqlCtx, jsonPath2) + + joined <- join(df, df2) + expect_equal(names(joined), c("age", "name", "name", "test")) + expect_true(count(joined) == 12) + + joined2 <- join(df, df2, df$name == df2$name) + expect_equal(names(joined2), c("age", "name", "name", "test")) + expect_true(count(joined2) == 3) + + joined3 <- join(df, df2, df$name == df2$name, "right_outer") + expect_equal(names(joined3), c("age", "name", "name", "test")) + expect_true(count(joined3) == 4) + expect_true(is.na(collect(joined3)$age[4])) + + joined4 <- select(join(df, df2, df$name == df2$name, "outer"), + alias(df$age + 5, "newAge"), df$name, df2$test) + expect_equal(names(joined4), c("newAge", "name", "test")) + expect_true(count(joined4) == 4) + expect_true(first(joined4)$newAge == 24) }) unlink(jsonPath) From 494a4ddef3bc39a8caa848c931033e1a6f6911bf Mon Sep 17 00:00:00 2001 From: cafreeman Date: Tue, 3 Mar 2015 16:06:06 -0600 Subject: [PATCH 2/5] update export --- pkg/R/DataFrame.R | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pkg/R/DataFrame.R b/pkg/R/DataFrame.R index 8daae8d694cf4..b8b118da0e6d2 100644 --- a/pkg/R/DataFrame.R +++ b/pkg/R/DataFrame.R @@ -628,6 +628,8 @@ setMethod("select", signature(x = "DataFrame", col = "Column"), #' } setGeneric("selectExpr", function(x, expr, ...) { standardGeneric("selectExpr") }) +#' @rdname selectExpr +#' @export setMethod("selectExpr", signature(x = "DataFrame", expr = "character"), function(x, expr, ...) { From acea1468a9c71ea5d711e0a6845b79d4ad6c8b21 Mon Sep 17 00:00:00 2001 From: cafreeman Date: Tue, 3 Mar 2015 16:20:55 -0600 Subject: [PATCH 3/5] remove extra line --- pkg/R/DataFrame.R | 1 - 1 file changed, 1 deletion(-) diff --git a/pkg/R/DataFrame.R b/pkg/R/DataFrame.R index b8b118da0e6d2..09adfce07f5ad 100644 --- a/pkg/R/DataFrame.R +++ b/pkg/R/DataFrame.R @@ -607,7 +607,6 @@ setMethod("select", signature(x = "DataFrame", col = "Column"), dataFrame(sdf) }) - #' SelectExpr #' #' Select from a DataFrame using a set of SQL expressions. From 79186340c2095ba6ec32b06ebb1a3daffd72b486 Mon Sep 17 00:00:00 2001 From: cafreeman Date: Tue, 3 Mar 2015 16:22:31 -0600 Subject: [PATCH 4/5] Fix test --- pkg/inst/tests/test_sparkSQL.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/inst/tests/test_sparkSQL.R b/pkg/inst/tests/test_sparkSQL.R index 68ef8d0dc56a3..539fde06e0108 100644 --- a/pkg/inst/tests/test_sparkSQL.R +++ b/pkg/inst/tests/test_sparkSQL.R @@ -234,7 +234,7 @@ test_that("select with column", { test_that("selectExpr() on a DataFrame", { df <- jsonFile(sqlCtx, jsonPath) selected <- selectExpr(df, "age * 2") - expect_true(names(selected) == "age * 2") + expect_true(names(selected) == "(age * 2)") expect_equal(collect(selected), collect(select(df, df$age * 2L))) selected2 <- selectExpr(df, "name as newName", "abs(age) as age") From bc901159e30ef3bebb2b8a7a36c9abfb3b1cf230 Mon Sep 17 00:00:00 2001 From: cafreeman Date: Wed, 4 Mar 2015 08:33:00 -0600 Subject: [PATCH 5/5] Fixed docs --- pkg/R/DataFrame.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/R/DataFrame.R b/pkg/R/DataFrame.R index 09adfce07f5ad..704470cd5210c 100644 --- a/pkg/R/DataFrame.R +++ b/pkg/R/DataFrame.R @@ -611,7 +611,7 @@ setMethod("select", signature(x = "DataFrame", col = "Column"), #' #' Select from a DataFrame using a set of SQL expressions. #' -#' @param x A DataFrame to be sorted. +#' @param x A DataFrame to be selected from. #' @param expr A string containing a SQL expression #' @param ... Additional expressions #' @return A DataFrame