Fill the docs for DataFrame API in SparkR
hqzizania committed May 7, 2015
1 parent 857220f commit 6813860
Showing 2 changed files with 81 additions and 77 deletions.
94 changes: 49 additions & 45 deletions R/pkg/R/DataFrame.R
@@ -45,6 +45,9 @@ setMethod("initialize", "DataFrame", function(.Object, sdf, isCached) {

#' @rdname DataFrame
#' @export
#'
#' @param sdf A Java object reference to the backing Scala DataFrame
#' @param isCached TRUE if the DataFrame is cached
dataFrame <- function(sdf, isCached = FALSE) {
new("DataFrame", sdf, isCached)
}
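
For context: dataFrame() is an internal constructor, so the @param entries added here mainly help contributors. User code normally obtains a DataFrame through the SQLContext helpers, which build the Java-side reference for you. A minimal sketch, assuming the SparkR API of this era:

sc <- sparkR.init()
sqlCtx <- sparkRSQL.init(sc)
df <- jsonFile(sqlCtx, "path/to/file.json")  # wraps the Scala DataFrame via dataFrame(sdf)
class(df)  # "DataFrame"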
@@ -244,7 +247,7 @@ setMethod("columns",
})

#' @rdname columns
#' @export
#' @aliases names,DataFrame,function-method
setMethod("names",
signature(x = "DataFrame"),
function(x) {
@@ -399,23 +402,23 @@ setMethod("repartition",
dataFrame(sdf)
})

#' toJSON
#'
#' Convert the rows of a DataFrame into JSON objects and return an RDD where
#' each element contains a JSON string.
#'
#' @param x A SparkSQL DataFrame
#' @return A StringRRDD of JSON objects
#' @rdname tojson
#' @export
#' @examples
#'\dontrun{
#' sc <- sparkR.init()
#' sqlCtx <- sparkRSQL.init(sc)
#' path <- "path/to/file.json"
#' df <- jsonFile(sqlCtx, path)
#' newRDD <- toJSON(df)
#'}
# toJSON
#
# Convert the rows of a DataFrame into JSON objects and return an RDD where
# each element contains a JSON string.
#
# @param x A SparkSQL DataFrame
# @return A StringRRDD of JSON objects
# @rdname tojson
# @export
# @examples
#\dontrun{
# sc <- sparkR.init()
# sqlCtx <- sparkRSQL.init(sc)
# path <- "path/to/file.json"
# df <- jsonFile(sqlCtx, path)
# newRDD <- toJSON(df)
#}
setMethod("toJSON",
signature(x = "DataFrame"),
function(x) {
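
To illustrate the toJSON() docs being hidden above, a minimal sketch of the round trip, assuming collect() on an RDD returns an R list in this SparkR version (the row contents in the comment are hypothetical):

df <- jsonFile(sqlCtx, "path/to/file.json")
strRDD <- toJSON(df)
collect(strRDD)[[1]]  # one JSON string per row, e.g. "{\"name\":\"Andy\",\"age\":30}"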
@@ -578,8 +581,8 @@ setMethod("limit",
dataFrame(res)
})

# Take the first NUM rows of a DataFrame and return the results as a data.frame

#' Take the first NUM rows of a DataFrame and return the results as a data.frame
#'
#' @rdname take
#' @export
#' @examples
@@ -644,22 +647,22 @@ setMethod("first",
take(x, 1)
})
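
A quick sketch tying first() to take(), as the implementation above shows (df is assumed to be an existing DataFrame):

localDF <- take(df, 5)  # a local data.frame with at most 5 rows
row1 <- first(df)       # equivalent to take(df, 1)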

#' toRDD()
#'
#' Converts a Spark DataFrame to an RDD while preserving column names.
#'
#' @param x A Spark DataFrame
#'
#' @rdname DataFrame
#' @export
#' @examples
#'\dontrun{
#' sc <- sparkR.init()
#' sqlCtx <- sparkRSQL.init(sc)
#' path <- "path/to/file.json"
#' df <- jsonFile(sqlCtx, path)
#' rdd <- toRDD(df)
#' }
# toRDD()
#
# Converts a Spark DataFrame to an RDD while preserving column names.
#
# @param x A Spark DataFrame
#
# @rdname DataFrame
# @export
# @examples
#\dontrun{
# sc <- sparkR.init()
# sqlCtx <- sparkRSQL.init(sc)
# path <- "path/to/file.json"
# df <- jsonFile(sqlCtx, path)
# rdd <- toRDD(df)
# }
setMethod("toRDD",
signature(x = "DataFrame"),
function(x) {
@@ -706,6 +709,7 @@ setMethod("groupBy",
#'
#' Compute aggregates by specifying a list of columns
#'
#' @param x a DataFrame
#' @rdname DataFrame
#' @export
setMethod("agg",
@@ -721,53 +725,53 @@ setMethod("agg",
# the requested map function. #
###################################################################################

#' @rdname lapply
# @rdname lapply
setMethod("lapply",
signature(X = "DataFrame", FUN = "function"),
function(X, FUN) {
rdd <- toRDD(X)
lapply(rdd, FUN)
})
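
A sketch of the RDD-style methods now hidden from the generated docs: lapply()/map() convert the DataFrame to an RDD first, so FUN sees one row at a time. This assumes rows arrive as named lists (per toRDD()'s column-name preservation) and a hypothetical numeric age column:

doubled <- map(df, function(row) { row$age * 2 })
collect(doubled)  # a list with one doubled age per row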

#' @rdname lapply
# @rdname lapply
setMethod("map",
signature(X = "DataFrame", FUN = "function"),
function(X, FUN) {
lapply(X, FUN)
})

#' @rdname flatMap
# @rdname flatMap
setMethod("flatMap",
signature(X = "DataFrame", FUN = "function"),
function(X, FUN) {
rdd <- toRDD(X)
flatMap(rdd, FUN)
})

#' @rdname lapplyPartition
# @rdname lapplyPartition
setMethod("lapplyPartition",
signature(X = "DataFrame", FUN = "function"),
function(X, FUN) {
rdd <- toRDD(X)
lapplyPartition(rdd, FUN)
})

#' @rdname lapplyPartition
# @rdname lapplyPartition
setMethod("mapPartitions",
signature(X = "DataFrame", FUN = "function"),
function(X, FUN) {
lapplyPartition(X, FUN)
})

#' @rdname foreach
# @rdname foreach
setMethod("foreach",
signature(x = "DataFrame", func = "function"),
function(x, func) {
rdd <- toRDD(x)
foreach(rdd, func)
})
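
foreach() is evaluated purely for side effects. A sketch, with the caveat that func runs in worker processes, so printed output lands in worker logs rather than the driver console (the name column is hypothetical):

foreach(df, function(row) {
  cat("row name:", row$name, "\n")
})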

#' @rdname foreach
# @rdname foreach
setMethod("foreachPartition",
signature(x = "DataFrame", func = "function"),
function(x, func) {
@@ -1009,7 +1013,7 @@ setMethod("sortDF",
})

#' @rdname sortDF
#' @export
#' @aliases orderBy,DataFrame,function-method
setMethod("orderBy",
signature(x = "DataFrame", col = "characterOrColumn"),
function(x, col) {
@@ -1046,7 +1050,7 @@ setMethod("filter",
})

#' @rdname filter
#' @export
#' @aliases where,DataFrame,function-method
setMethod("where",
signature(x = "DataFrame", condition = "characterOrColumn"),
function(x, condition) {
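
Since where() now shares filter()'s @rdname, a sketch of the two equivalent spellings the characterOrColumn signature allows, assuming a DataFrame df with a numeric age column:

adults <- filter(df, df$age > 21)  # Column expression
adults <- where(df, "age > 21")    # the same condition as a SQL string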
64 changes: 32 additions & 32 deletions R/pkg/R/SQLContext.R
@@ -150,21 +150,21 @@ createDataFrame <- function(sqlCtx, data, schema = NULL, samplingRatio = 1.0) {
dataFrame(sdf)
}

#' toDF
#'
#' Converts an RDD to a DataFrame by inferring the types.
#'
#' @param x An RDD
#'
#' @rdname DataFrame
#' @export
#' @examples
#'\dontrun{
#' sc <- sparkR.init()
#' sqlCtx <- sparkRSQL.init(sc)
#' rdd <- lapply(parallelize(sc, 1:10), function(x) list(a=x, b=as.character(x)))
#' df <- toDF(rdd)
#' }
# toDF
#
# Converts an RDD to a DataFrame by inferring the types.
#
# @param x An RDD
#
# @rdname DataFrame
# @export
# @examples
#\dontrun{
# sc <- sparkR.init()
# sqlCtx <- sparkRSQL.init(sc)
# rdd <- lapply(parallelize(sc, 1:10), function(x) list(a=x, b=as.character(x)))
# df <- toDF(rdd)
# }

setGeneric("toDF", function(x, ...) { standardGeneric("toDF") })

@@ -207,23 +207,23 @@ jsonFile <- function(sqlCtx, path) {
}


#' JSON RDD
#'
#' Loads an RDD storing one JSON object per string as a DataFrame.
#'
#' @param sqlCtx SQLContext to use
#' @param rdd An RDD of JSON strings
#' @param schema A StructType object to use as schema
#' @param samplingRatio The ratio of sampling used to infer the schema
#' @return A DataFrame
#' @export
#' @examples
#'\dontrun{
#' sc <- sparkR.init()
#' sqlCtx <- sparkRSQL.init(sc)
#' rdd <- textFile(sc, "path/to/json")
#' df <- jsonRDD(sqlCtx, rdd)
#' }
# JSON RDD
#
# Loads an RDD storing one JSON object per string as a DataFrame.
#
# @param sqlCtx SQLContext to use
# @param rdd An RDD of JSON strings
# @param schema A StructType object to use as schema
# @param samplingRatio The ratio of sampling used to infer the schema
# @return A DataFrame
# @export
# @examples
#\dontrun{
# sc <- sparkR.init()
# sqlCtx <- sparkRSQL.init(sc)
# rdd <- textFile(sc, "path/to/json")
# df <- jsonRDD(sqlCtx, rdd)
# }

# TODO: support schema
jsonRDD <- function(sqlCtx, rdd, schema = NULL, samplingRatio = 1.0) {
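
A sketch of the samplingRatio parameter documented above; per the TODO, schema must stay NULL for now:

rdd <- textFile(sc, "path/to/json")
df <- jsonRDD(sqlCtx, rdd, samplingRatio = 0.5)  # infer the schema from ~half the rows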
