Merge pull request apache#253 from palantir/resync-apache
robert3005 authored Sep 15, 2017
2 parents 72d0f1e + 388cb48 commit 105e1b6
Showing 538 changed files with 15,328 additions and 5,937 deletions.
2 changes: 1 addition & 1 deletion R/pkg/DESCRIPTION
@@ -2,7 +2,7 @@ Package: SparkR
Type: Package
Version: 2.3.0
Title: R Frontend for Apache Spark
Description: The SparkR package provides an R Frontend for Apache Spark.
Description: Provides an R Frontend for Apache Spark.
Authors@R: c(person("Shivaram", "Venkataraman", role = c("aut", "cre"),
email = "shivaram@cs.berkeley.edu"),
person("Xiangrui", "Meng", role = "aut",
1 change: 1 addition & 0 deletions R/pkg/NAMESPACE
@@ -169,6 +169,7 @@ exportMethods("arrange",
"transform",
"union",
"unionAll",
"unionByName",
"unique",
"unpersist",
"where",
82 changes: 76 additions & 6 deletions R/pkg/R/DataFrame.R
@@ -2683,7 +2683,7 @@ generateAliasesForIntersectedCols <- function (x, intersectedColNames, suffix) {
#' @rdname union
#' @name union
#' @aliases union,SparkDataFrame,SparkDataFrame-method
#' @seealso \link{rbind}
#' @seealso \link{rbind} \link{unionByName}
#' @export
#' @examples
#'\dontrun{
@@ -2714,6 +2714,40 @@ setMethod("unionAll",
union(x, y)
})

#' Return a new SparkDataFrame containing the union of rows, matched by column names
#'
#' Return a new SparkDataFrame containing the union of rows in this SparkDataFrame
#' and another SparkDataFrame. This is different from the \code{union} function, and from
#' both \code{UNION ALL} and \code{UNION DISTINCT} in SQL, as column positions are not
#' taken into account. Input SparkDataFrames can have different data types in the schema.
#'
#' Note: This does not remove duplicate rows across the two SparkDataFrames.
#' This function resolves columns by name (not by position).
#'
#' @param x A SparkDataFrame
#' @param y A SparkDataFrame
#' @return A SparkDataFrame containing the result of the union.
#' @family SparkDataFrame functions
#' @rdname unionByName
#' @name unionByName
#' @aliases unionByName,SparkDataFrame,SparkDataFrame-method
#' @seealso \link{rbind} \link{union}
#' @export
#' @examples
#'\dontrun{
#' sparkR.session()
#' df1 <- select(createDataFrame(mtcars), "carb", "am", "gear")
#' df2 <- select(createDataFrame(mtcars), "am", "gear", "carb")
#' head(unionByName(df1, df2))
#' }
#' @note unionByName since 2.3.0
setMethod("unionByName",
signature(x = "SparkDataFrame", y = "SparkDataFrame"),
function(x, y) {
unioned <- callJMethod(x@sdf, "unionByName", y@sdf)
dataFrame(unioned)
})
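
For readers comparing the two variants, the following is a minimal sketch of how the new
method behaves once this patch lands; it reuses the mtcars frames from the roxygen example
above, and the comments describe expected behaviour rather than output captured from this
commit.

library(SparkR)
sparkR.session()

df1 <- select(createDataFrame(mtcars), "carb", "am", "gear")
df2 <- select(createDataFrame(mtcars), "am", "gear", "carb")

# union() binds rows by position, so df2's "am" values end up under df1's "carb" column.
positional <- union(df1, df2)

# unionByName() resolves columns by name, so each input feeds the correctly named column.
by_name <- unionByName(df1, df2)
head(by_name)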

#' Union two or more SparkDataFrames
#'
#' Union two or more SparkDataFrames by row. As in R's \code{rbind}, this method
@@ -2730,7 +2764,7 @@ setMethod("unionAll",
#' @aliases rbind,SparkDataFrame-method
#' @rdname rbind
#' @name rbind
#' @seealso \link{union}
#' @seealso \link{union} \link{unionByName}
#' @export
#' @examples
#'\dontrun{
@@ -2930,7 +2964,7 @@ setMethod("saveAsTable",
invisible(callJMethod(write, "saveAsTable", tableName))
})

#' summary
#' describe
#'
#' Computes statistics for numeric and string columns.
#' If no columns are given, this function computes statistics for all numerical or string columns.
@@ -2941,7 +2975,7 @@ setMethod("saveAsTable",
#' @return A SparkDataFrame.
#' @family SparkDataFrame functions
#' @aliases describe,SparkDataFrame,character-method describe,SparkDataFrame,ANY-method
#' @rdname summary
#' @rdname describe
#' @name describe
#' @export
#' @examples
@@ -2953,6 +2987,7 @@ setMethod("saveAsTable",
#' describe(df, "col1")
#' describe(df, "col1", "col2")
#' }
#' @seealso \link{summary} for expanded statistics and control over which statistics to compute.
#' @note describe(SparkDataFrame, character) since 1.4.0
setMethod("describe",
signature(x = "SparkDataFrame", col = "character"),
@@ -2962,7 +2997,7 @@ setMethod("describe",
dataFrame(sdf)
})
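
A brief sketch of the retained behaviour, assuming a running SparkR session: describe() keeps
the pre-2.3.0 default statistics, while the reworked summary() further below adds percentile
control.

df <- createDataFrame(faithful)
# count, mean, stddev, min and max for every numeric column.
collect(describe(df))
# Restrict the statistics to a single column.
collect(describe(df, "eruptions"))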

#' @rdname summary
#' @rdname describe
#' @name describe
#' @aliases describe,SparkDataFrame-method
#' @note describe(SparkDataFrame) since 1.4.0
@@ -2973,15 +3008,50 @@ setMethod("describe",
dataFrame(sdf)
})

#' summary
#'
#' Computes specified statistics for numeric and string columns. Available statistics are:
#' \itemize{
#' \item count
#' \item mean
#' \item stddev
#' \item min
#' \item max
#' \item arbitrary approximate percentiles specified as a percentage (e.g., "75%")
#' }
#' If no statistics are given, this function computes count, mean, stddev, min,
#' approximate quartiles (percentiles at 25%, 50%, and 75%), and max.
#' This function is meant for exploratory data analysis, as we make no guarantee about the
#' backward compatibility of the schema of the resulting SparkDataFrame. If you want to
#' programmatically compute summary statistics, use the \code{agg} function instead.
#'
#' @param object a SparkDataFrame to be summarized.
#' @param ... (optional) statistics to be computed for all columns.
#' @return A SparkDataFrame.
#' @family SparkDataFrame functions
#' @rdname summary
#' @name summary
#' @aliases summary,SparkDataFrame-method
#' @export
#' @examples
#'\dontrun{
#' sparkR.session()
#' path <- "path/to/file.json"
#' df <- read.json(path)
#' summary(df)
#' summary(df, "min", "25%", "75%", "max")
#' summary(select(df, "age", "height"))
#' }
#' @note summary(SparkDataFrame) since 1.5.0
#' @note The statistics provided by \code{summary} were changed in 2.3.0. Use \link{describe} for the previous defaults.
#' @seealso \link{describe}
setMethod("summary",
signature(object = "SparkDataFrame"),
function(object, ...) {
describe(object)
statisticsList <- list(...)
sdf <- callJMethod(object@sdf, "summary", statisticsList)
dataFrame(sdf)
})
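
As a rough usage sketch of the reworked method (the default statistics and the percentile
syntax follow the description above; the agg() call is the stable-schema alternative the
documentation recommends):

df <- createDataFrame(faithful)

# Default: count, mean, stddev, min, approximate quartiles (25%, 50%, 75%) and max.
collect(summary(df))

# Requesting specific statistics, including an arbitrary approximate percentile.
collect(summary(df, "min", "75%", "max"))

# For a schema you can rely on programmatically, aggregate explicitly instead.
collect(agg(df, min_eruptions = min(df$eruptions), max_eruptions = max(df$eruptions)))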


16 changes: 13 additions & 3 deletions R/pkg/R/functions.R
@@ -176,7 +176,8 @@ NULL
#'
#' @param x Column to compute on. Note the difference in the following methods:
#' \itemize{
#' \item \code{to_json}: it is the column containing the struct or array of the structs.
#' \item \code{to_json}: it is the column containing the struct, the array of structs,
#' the map, or the array of maps.
#' \item \code{from_json}: it is the column containing the JSON string.
#' }
#' @param ... additional argument(s). In \code{to_json} and \code{from_json}, this contains
@@ -1700,8 +1701,9 @@ setMethod("to_date",
})

#' @details
#' \code{to_json}: Converts a column containing a \code{structType} or array of \code{structType}
#' into a Column of JSON string. Resolving the Column can fail if an unsupported type is encountered.
#' \code{to_json}: Converts a column containing a \code{structType}, array of \code{structType},
#' a \code{mapType} or array of \code{mapType} into a Column of JSON string.
#' Resolving the Column can fail if an unsupported type is encountered.
#'
#' @rdname column_collection_functions
#' @aliases to_json to_json,Column-method
@@ -1715,6 +1717,14 @@ setMethod("to_date",
#'
#' # Converts an array of structs into a JSON array
#' df2 <- sql("SELECT array(named_struct('name', 'Bob'), named_struct('name', 'Alice')) as people")
#' df2 <- mutate(df2, people_json = to_json(df2$people))
#'
#' # Converts a map into a JSON object
#' df2 <- sql("SELECT map('name', 'Bob')) as people")
#' df2 <- mutate(df2, people_json = to_json(df2$people))
#'
#' # Converts an array of maps into a JSON array
#' df2 <- sql("SELECT array(map('name', 'Bob'), map('name', 'Alice')) as people")
#' df2 <- mutate(df2, people_json = to_json(df2$people))}
#' @note to_json since 2.2.0
setMethod("to_json", signature(x = "Column"),
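
To make the newly supported inputs concrete, here is a hedged sketch based on the roxygen
examples above; the JSON strings in the comments are the expected shape of the output, not
output captured from this commit.

df <- sql("SELECT map('name', 'Bob') as people")
df <- mutate(df, people_json = to_json(df$people))    # roughly {"name":"Bob"}

df2 <- sql("SELECT array(map('name', 'Bob'), map('name', 'Alice')) as people")
df2 <- mutate(df2, people_json = to_json(df2$people)) # roughly [{"name":"Bob"},{"name":"Alice"}]
head(select(df2, "people_json"))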
6 changes: 5 additions & 1 deletion R/pkg/R/generics.R
@@ -521,7 +521,7 @@ setGeneric("gapplyCollect", function(x, ...) { standardGeneric("gapplyCollect")
# @export
setGeneric("getNumPartitions", function(x) { standardGeneric("getNumPartitions") })

#' @rdname summary
#' @rdname describe
#' @export
setGeneric("describe", function(x, col, ...) { standardGeneric("describe") })

@@ -769,6 +769,10 @@ setGeneric("union", function(x, y) { standardGeneric("union") })
#' @export
setGeneric("unionAll", function(x, y) { standardGeneric("unionAll") })

#' @rdname unionByName
#' @export
setGeneric("unionByName", function(x, y) { standardGeneric("unionByName") })

#' @rdname unpersist
#' @export
setGeneric("unpersist", function(x, ...) { standardGeneric("unpersist") })
6 changes: 5 additions & 1 deletion R/pkg/R/install.R
@@ -270,7 +270,11 @@ sparkCachePath <- function() {
if (is_windows()) {
winAppPath <- Sys.getenv("LOCALAPPDATA", unset = NA)
if (is.na(winAppPath)) {
stop(paste("%LOCALAPPDATA% not found.",
message("%LOCALAPPDATA% not found. Falling back to %USERPROFILE%.")
winAppPath <- Sys.getenv("USERPROFILE", unset = NA)
}
if (is.na(winAppPath)) {
stop(paste("%LOCALAPPDATA% and %USERPROFILE% not found.",
"Please define the environment variable",
"or restart and enter an installation path in localDir."))
} else {
36 changes: 19 additions & 17 deletions R/pkg/tests/fulltests/test_mllib_tree.R
@@ -66,7 +66,7 @@ test_that("spark.gbt", {
# label must be binary - GBTClassifier currently only supports binary classification.
iris2 <- iris[iris$Species != "virginica", ]
data <- suppressWarnings(createDataFrame(iris2))
model <- spark.gbt(data, Species ~ Petal_Length + Petal_Width, "classification")
model <- spark.gbt(data, Species ~ Petal_Length + Petal_Width, "classification", seed = 12)
stats <- summary(model)
expect_equal(stats$numFeatures, 2)
expect_equal(stats$numTrees, 20)
@@ -94,7 +94,7 @@ test_that("spark.gbt", {

iris2$NumericSpecies <- ifelse(iris2$Species == "setosa", 0, 1)
df <- suppressWarnings(createDataFrame(iris2))
m <- spark.gbt(df, NumericSpecies ~ ., type = "classification")
m <- spark.gbt(df, NumericSpecies ~ ., type = "classification", seed = 12)
s <- summary(m)
# test numeric prediction values
expect_equal(iris2$NumericSpecies, as.double(collect(predict(m, df))$prediction))
@@ -106,7 +106,7 @@ test_that("spark.gbt", {
if (windows_with_hadoop()) {
data <- read.df(absoluteSparkPath("data/mllib/sample_binary_classification_data.txt"),
source = "libsvm")
model <- spark.gbt(data, label ~ features, "classification")
model <- spark.gbt(data, label ~ features, "classification", seed = 12)
expect_equal(summary(model)$numFeatures, 692)
}

@@ -117,10 +117,11 @@ test_that("spark.gbt", {
trainidxs <- base::sample(nrow(data), nrow(data) * 0.7)
traindf <- as.DataFrame(data[trainidxs, ])
testdf <- as.DataFrame(rbind(data[-trainidxs, ], c(0, "the other")))
model <- spark.gbt(traindf, clicked ~ ., type = "classification")
model <- spark.gbt(traindf, clicked ~ ., type = "classification", seed = 23)
predictions <- predict(model, testdf)
expect_error(collect(predictions))
model <- spark.gbt(traindf, clicked ~ ., type = "classification", handleInvalid = "keep")
model <- spark.gbt(traindf, clicked ~ ., type = "classification", handleInvalid = "keep",
seed = 23)
predictions <- predict(model, testdf)
expect_equal(class(collect(predictions)$clicked[1]), "character")
})
@@ -129,7 +130,7 @@ test_that("spark.randomForest", {
# regression
data <- suppressWarnings(createDataFrame(longley))
model <- spark.randomForest(data, Employed ~ ., "regression", maxDepth = 5, maxBins = 16,
numTrees = 1)
numTrees = 1, seed = 1)

predictions <- collect(predict(model, data))
expect_equal(predictions$prediction, c(60.323, 61.122, 60.171, 61.187,
@@ -177,7 +178,7 @@ test_that("spark.randomForest", {
# classification
data <- suppressWarnings(createDataFrame(iris))
model <- spark.randomForest(data, Species ~ Petal_Length + Petal_Width, "classification",
maxDepth = 5, maxBins = 16)
maxDepth = 5, maxBins = 16, seed = 123)

stats <- summary(model)
expect_equal(stats$numFeatures, 2)
@@ -215,7 +216,7 @@ test_that("spark.randomForest", {
iris$NumericSpecies <- lapply(iris$Species, labelToIndex)
data <- suppressWarnings(createDataFrame(iris[-5]))
model <- spark.randomForest(data, NumericSpecies ~ Petal_Length + Petal_Width, "classification",
maxDepth = 5, maxBins = 16)
maxDepth = 5, maxBins = 16, seed = 123)
stats <- summary(model)
expect_equal(stats$numFeatures, 2)
expect_equal(stats$numTrees, 20)
@@ -234,28 +235,29 @@ test_that("spark.randomForest", {
traindf <- as.DataFrame(data[trainidxs, ])
testdf <- as.DataFrame(rbind(data[-trainidxs, ], c(0, "the other")))
model <- spark.randomForest(traindf, clicked ~ ., type = "classification",
maxDepth = 10, maxBins = 10, numTrees = 10)
maxDepth = 10, maxBins = 10, numTrees = 10, seed = 123)
predictions <- predict(model, testdf)
expect_error(collect(predictions))
model <- spark.randomForest(traindf, clicked ~ ., type = "classification",
maxDepth = 10, maxBins = 10, numTrees = 10,
handleInvalid = "keep")
handleInvalid = "keep", seed = 123)
predictions <- predict(model, testdf)
expect_equal(class(collect(predictions)$clicked[1]), "character")

# spark.randomForest classification can work on libsvm data
if (windows_with_hadoop()) {
data <- read.df(absoluteSparkPath("data/mllib/sample_multiclass_classification_data.txt"),
source = "libsvm")
model <- spark.randomForest(data, label ~ features, "classification")
model <- spark.randomForest(data, label ~ features, "classification", seed = 123)
expect_equal(summary(model)$numFeatures, 4)
}
})

test_that("spark.decisionTree", {
# regression
data <- suppressWarnings(createDataFrame(longley))
model <- spark.decisionTree(data, Employed ~ ., "regression", maxDepth = 5, maxBins = 16)
model <- spark.decisionTree(data, Employed ~ ., "regression", maxDepth = 5, maxBins = 16,
seed = 42)

predictions <- collect(predict(model, data))
expect_equal(predictions$prediction, c(60.323, 61.122, 60.171, 61.187,
@@ -288,7 +290,7 @@ test_that("spark.decisionTree", {
# classification
data <- suppressWarnings(createDataFrame(iris))
model <- spark.decisionTree(data, Species ~ Petal_Length + Petal_Width, "classification",
maxDepth = 5, maxBins = 16)
maxDepth = 5, maxBins = 16, seed = 43)

stats <- summary(model)
expect_equal(stats$numFeatures, 2)
@@ -325,7 +327,7 @@ test_that("spark.decisionTree", {
iris$NumericSpecies <- lapply(iris$Species, labelToIndex)
data <- suppressWarnings(createDataFrame(iris[-5]))
model <- spark.decisionTree(data, NumericSpecies ~ Petal_Length + Petal_Width, "classification",
maxDepth = 5, maxBins = 16)
maxDepth = 5, maxBins = 16, seed = 44)
stats <- summary(model)
expect_equal(stats$numFeatures, 2)
expect_equal(stats$maxDepth, 5)
@@ -339,7 +341,7 @@ test_that("spark.decisionTree", {
if (windows_with_hadoop()) {
data <- read.df(absoluteSparkPath("data/mllib/sample_multiclass_classification_data.txt"),
source = "libsvm")
model <- spark.decisionTree(data, label ~ features, "classification")
model <- spark.decisionTree(data, label ~ features, "classification", seed = 45)
expect_equal(summary(model)$numFeatures, 4)
}

@@ -351,11 +353,11 @@ test_that("spark.decisionTree", {
traindf <- as.DataFrame(data[trainidxs, ])
testdf <- as.DataFrame(rbind(data[-trainidxs, ], c(0, "the other")))
model <- spark.decisionTree(traindf, clicked ~ ., type = "classification",
maxDepth = 5, maxBins = 16)
maxDepth = 5, maxBins = 16, seed = 46)
predictions <- predict(model, testdf)
expect_error(collect(predictions))
model <- spark.decisionTree(traindf, clicked ~ ., type = "classification",
maxDepth = 5, maxBins = 16, handleInvalid = "keep")
maxDepth = 5, maxBins = 16, handleInvalid = "keep", seed = 46)
predictions <- predict(model, testdf)
expect_equal(class(collect(predictions)$clicked[1]), "character")
})
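
The recurring change in this test file is an explicit seed on every tree-model fit. A brief
sketch of the motivation, reusing the same longley data as the regression tests above: tree
ensembles subsample rows and features, so two fits without a fixed seed can differ and make
exact-value expectations flaky.

data <- suppressWarnings(createDataFrame(longley))
m1 <- spark.randomForest(data, Employed ~ ., "regression", numTrees = 5, seed = 1)
m2 <- spark.randomForest(data, Employed ~ ., "regression", numTrees = 5, seed = 1)
p1 <- collect(predict(m1, data))$prediction
p2 <- collect(predict(m2, data))$prediction
# With a fixed seed the two fits should agree; without one they may not.
all.equal(p1, p2)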