
Commit

remove more docs
hqzizania committed May 8, 2015
1 parent 6394579 commit eb4b095
Showing 4 changed files with 155 additions and 155 deletions.
2 changes: 1 addition & 1 deletion R/pkg/DESCRIPTION
@@ -15,11 +15,11 @@ Suggests:
Description: R frontend for Spark
License: Apache License (== 2.0)
Collate:
'schema.R'
'generics.R'
'jobj.R'
'RDD.R'
'pairRDD.R'
'schema.R'
'column.R'
'group.R'
'DataFrame.R'
64 changes: 32 additions & 32 deletions R/pkg/R/broadcast.R
@@ -23,35 +23,35 @@
.broadcastValues <- new.env()
.broadcastIdToName <- new.env()

#' @title S4 class that represents a Broadcast variable
#' @description Broadcast variables can be created using the broadcast
#' function from a \code{SparkContext}.
#' @rdname broadcast-class
#' @seealso broadcast
#'
#' @param id Id of the backing Spark broadcast variable
#' @export
# @title S4 class that represents a Broadcast variable
# @description Broadcast variables can be created using the broadcast
# function from a \code{SparkContext}.
# @rdname broadcast-class
# @seealso broadcast
#
# @param id Id of the backing Spark broadcast variable
# @export
setClass("Broadcast", slots = list(id = "character"))

#' @rdname broadcast-class
#' @param value Value of the broadcast variable
#' @param jBroadcastRef reference to the backing Java broadcast object
#' @param objName name of broadcasted object
#' @export
# @rdname broadcast-class
# @param value Value of the broadcast variable
# @param jBroadcastRef reference to the backing Java broadcast object
# @param objName name of broadcasted object
# @export
Broadcast <- function(id, value, jBroadcastRef, objName) {
.broadcastValues[[id]] <- value
.broadcastNames[[as.character(objName)]] <- jBroadcastRef
.broadcastIdToName[[id]] <- as.character(objName)
new("Broadcast", id = id)
}
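
# Illustrative sketch (not part of the commit): calling the constructor above
# caches the value in the package-internal environments, keyed by id. The id
# "0" and the NULL Java reference are placeholders for what broadcast(sc, object)
# in context.R would normally supply.
b <- Broadcast("0", value = 1:5, jBroadcastRef = NULL, objName = "myVec")
get("0", envir = .broadcastValues)      # 1:5, the locally cached value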

#' @description
#' \code{value} can be used to get the value of a broadcast variable inside
#' a distributed function.
#'
#' @param bcast The broadcast variable to get
#' @rdname broadcast
#' @aliases value,Broadcast-method
# @description
# \code{value} can be used to get the value of a broadcast variable inside
# a distributed function.
#
# @param bcast The broadcast variable to get
# @rdname broadcast
# @aliases value,Broadcast-method
setMethod("value",
signature(bcast = "Broadcast"),
function(bcast) {
@@ -62,24 +62,24 @@ setMethod("value",
}
})
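
# Usage sketch (assumes a SparkContext `sc` from sparkR.init(), and that
# value() resolves the id against .broadcastValues as above): read a
# broadcast's cached value on the driver and inside a distributed function.
nums <- 1:100
numsBr <- broadcast(sc, nums)
value(numsBr)                                   # the full vector 1:100
collect(lapply(parallelize(sc, 1:2, 2L),
               function(x) x * sum(value(numsBr))))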

#' Internal function to set values of a broadcast variable.
#'
#' This function is used internally by Spark to set the value of a broadcast
#' variable on workers. Not intended for use outside the package.
#'
#' @rdname broadcast-internal
#' @seealso broadcast, value
# Internal function to set values of a broadcast variable.
#
# This function is used internally by Spark to set the value of a broadcast
# variable on workers. Not intended for use outside the package.
#
# @rdname broadcast-internal
# @seealso broadcast, value

#' @param bcastId The id of broadcast variable to set
#' @param value The value to be set
#' @export
# @param bcastId The id of broadcast variable to set
# @param value The value to be set
# @export
setBroadcastValue <- function(bcastId, value) {
bcastIdStr <- as.character(bcastId)
.broadcastValues[[bcastIdStr]] <- value
}
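
# Sketch of the worker-side flow this helper supports (the id "0" is a
# placeholder): the JVM ships the serialized value to the R worker, which
# calls setBroadcastValue so that later value() calls resolve locally.
setBroadcastValue("0", list(a = 1, b = 2))
value(new("Broadcast", id = "0"))       # list(a = 1, b = 2)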

#' Helper function to clear the list of broadcast variables we know about
#' Should be called when the SparkR JVM backend is shutdown
# Helper function to clear the list of broadcast variables we know about
# Should be called when the SparkR JVM backend is shutdown
clearBroadcastVariables <- function() {
bcasts <- ls(.broadcastNames)
rm(list = bcasts, envir = .broadcastNames)
240 changes: 120 additions & 120 deletions R/pkg/R/context.R
@@ -25,80 +25,80 @@ getMinPartitions <- function(sc, minPartitions) {
as.integer(minPartitions)
}

#' Create an RDD from a text file.
#'
#' This function reads a text file from HDFS, a local file system (available on all
#' nodes), or any Hadoop-supported file system URI, and creates an
#' RDD of strings from it.
#'
#' @param sc SparkContext to use
#' @param path Path of file to read. A vector of multiple paths is allowed.
#' @param minPartitions Minimum number of partitions to be created. If NULL, the default
#' value is chosen based on available parallelism.
#' @return RDD where each item is of type \code{character}
#' @export
#' @examples
#'\dontrun{
#' sc <- sparkR.init()
#' lines <- textFile(sc, "myfile.txt")
#'}
# Create an RDD from a text file.
#
# This function reads a text file from HDFS, a local file system (available on all
# nodes), or any Hadoop-supported file system URI, and creates an
# RDD of strings from it.
#
# @param sc SparkContext to use
# @param path Path of file to read. A vector of multiple paths is allowed.
# @param minPartitions Minimum number of partitions to be created. If NULL, the default
# value is chosen based on available parallelism.
# @return RDD where each item is of type \code{character}
# @export
# @examples
#\dontrun{
# sc <- sparkR.init()
# lines <- textFile(sc, "myfile.txt")
#}
textFile <- function(sc, path, minPartitions = NULL) {
# Allow the user to have a more flexible definition of the text file path
path <- suppressWarnings(normalizePath(path))
#' Convert a string vector of paths to a string containing comma separated paths
# Convert a string vector of paths to a string containing comma separated paths
path <- paste(path, collapse = ",")

jrdd <- callJMethod(sc, "textFile", path, getMinPartitions(sc, minPartitions))
# jrdd is of type JavaRDD[String]
RDD(jrdd, "string")
}
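
# Usage sketch (assumes `sc` from sparkR.init(); the HDFS path is a
# placeholder): read a text file into an RDD of strings and inspect it.
lines <- textFile(sc, "hdfs://namenode:8020/data/logs.txt", minPartitions = 4L)
count(lines)       # number of lines in the file
take(lines, 5L)    # first five lines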

#' Load an RDD saved as a SequenceFile containing serialized objects.
#'
#' The file to be loaded should be one that was previously generated by calling
#' saveAsObjectFile() of the RDD class.
#'
#' @param sc SparkContext to use
#' @param path Path of file to read. A vector of multiple paths is allowed.
#' @param minPartitions Minimum number of partitions to be created. If NULL, the default
#' value is chosen based on available parallelism.
#' @return RDD containing serialized R objects.
#' @seealso saveAsObjectFile
#' @export
#' @examples
#'\dontrun{
#' sc <- sparkR.init()
#' rdd <- objectFile(sc, "myfile")
#'}
# Load an RDD saved as a SequenceFile containing serialized objects.
#
# The file to be loaded should be one that was previously generated by calling
# saveAsObjectFile() of the RDD class.
#
# @param sc SparkContext to use
# @param path Path of file to read. A vector of multiple paths is allowed.
# @param minPartitions Minimum number of partitions to be created. If NULL, the default
# value is chosen based on available parallelism.
# @return RDD containing serialized R objects.
# @seealso saveAsObjectFile
# @export
# @examples
#\dontrun{
# sc <- sparkR.init()
# rdd <- objectFile(sc, "myfile")
#}
objectFile <- function(sc, path, minPartitions = NULL) {
# Allow the user to have a more flexible definition of the text file path
path <- suppressWarnings(normalizePath(path))
#' Convert a string vector of paths to a string containing comma separated paths
# Convert a string vector of paths to a string containing comma separated paths
path <- paste(path, collapse = ",")

jrdd <- callJMethod(sc, "objectFile", path, getMinPartitions(sc, minPartitions))
# Assume the RDD contains serialized R objects.
RDD(jrdd, "byte")
}
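
# Round-trip sketch (assumes `sc`; saveAsObjectFile is the RDD method the
# docs above refer to, and the output path is a placeholder): save
# serialized R objects, then load them back with objectFile.
rdd <- parallelize(sc, list(1:3, letters[1:3]), 2L)
saveAsObjectFile(rdd, "/tmp/r-objects")
collect(objectFile(sc, "/tmp/r-objects"))   # the original list elements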

#' Create an RDD from a homogeneous list or vector.
#'
#' This function creates an RDD from a local homogeneous list in R. The elements
#' in the list are split into \code{numSlices} slices and distributed to nodes
#' in the cluster.
#'
#' @param sc SparkContext to use
#' @param coll collection to parallelize
#' @param numSlices number of partitions to create in the RDD
#' @return an RDD created from this collection
#' @export
#' @examples
#'\dontrun{
#' sc <- sparkR.init()
#' rdd <- parallelize(sc, 1:10, 2)
#' # The RDD should contain 10 elements
#' length(rdd)
#'}
# Create an RDD from a homogeneous list or vector.
#
# This function creates an RDD from a local homogeneous list in R. The elements
# in the list are split into \code{numSlices} slices and distributed to nodes
# in the cluster.
#
# @param sc SparkContext to use
# @param coll collection to parallelize
# @param numSlices number of partitions to create in the RDD
# @return an RDD created from this collection
# @export
# @examples
#\dontrun{
# sc <- sparkR.init()
# rdd <- parallelize(sc, 1:10, 2)
# # The RDD should contain 10 elements
# length(rdd)
#}
parallelize <- function(sc, coll, numSlices = 1) {
# TODO: bound/safeguard numSlices
# TODO: unit tests for if the split works for all primitives
@@ -133,33 +133,33 @@ parallelize <- function(sc, coll, numSlices = 1) {
RDD(jrdd, "byte")
}
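
# Minimal sketch (assumes `sc`): distribute a local vector over two
# partitions and bring it back to the driver.
rdd <- parallelize(sc, 1:10, numSlices = 2)
collect(rdd)    # list of the 10 original elements
count(rdd)      # 10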

#' Include this specified package on all workers
#'
#' This function can be used to include a package on all workers before the
#' user's code is executed. This is useful in scenarios where other R package
#' functions are used in a function passed to functions like \code{lapply}.
#' NOTE: The package is assumed to be installed on every node in the Spark
#' cluster.
#'
#' @param sc SparkContext to use
#' @param pkg Package name
#'
#' @export
#' @examples
#'\dontrun{
#' library(Matrix)
#'
#' sc <- sparkR.init()
#' # Include the matrix library we will be using
#' includePackage(sc, Matrix)
#'
#' generateSparse <- function(x) {
#' sparseMatrix(i=c(1, 2, 3), j=c(1, 2, 3), x=c(1, 2, 3))
#' }
#'
#' rdd <- lapplyPartition(parallelize(sc, 1:2, 2L), generateSparse)
#' collect(rdd)
#'}
# Include this specified package on all workers
#
# This function can be used to include a package on all workers before the
# user's code is executed. This is useful in scenarios where other R package
# functions are used in a function passed to functions like \code{lapply}.
# NOTE: The package is assumed to be installed on every node in the Spark
# cluster.
#
# @param sc SparkContext to use
# @param pkg Package name
#
# @export
# @examples
#\dontrun{
# library(Matrix)
#
# sc <- sparkR.init()
# # Include the matrix library we will be using
# includePackage(sc, Matrix)
#
# generateSparse <- function(x) {
# sparseMatrix(i=c(1, 2, 3), j=c(1, 2, 3), x=c(1, 2, 3))
# }
#
# rdd <- lapplyPartition(parallelize(sc, 1:2, 2L), generateSparse)
# collect(rdd)
#}
includePackage <- function(sc, pkg) {
pkg <- as.character(substitute(pkg))
if (exists(".packages", .sparkREnv)) {
@@ -171,30 +171,30 @@ includePackage <- function(sc, pkg) {
.sparkREnv$.packages <- packages
}

#' @title Broadcast a variable to all workers
#'
#' @description
#' Broadcast a read-only variable to the cluster, returning a \code{Broadcast}
#' object for reading it in distributed functions.
#'
#' @param sc Spark Context to use
#' @param object Object to be broadcast
#' @export
#' @examples
#'\dontrun{
#' sc <- sparkR.init()
#' rdd <- parallelize(sc, 1:2, 2L)
#'
#' # Large Matrix object that we want to broadcast
#' randomMat <- matrix(nrow=100, ncol=10, data=rnorm(1000))
#' randomMatBr <- broadcast(sc, randomMat)
#'
#' # Use the broadcast variable inside the function
#' useBroadcast <- function(x) {
#' sum(value(randomMatBr) * x)
#' }
#' sumRDD <- lapply(rdd, useBroadcast)
#'}
# @title Broadcast a variable to all workers
#
# @description
# Broadcast a read-only variable to the cluster, returning a \code{Broadcast}
# object for reading it in distributed functions.
#
# @param sc Spark Context to use
# @param object Object to be broadcast
# @export
# @examples
#\dontrun{
# sc <- sparkR.init()
# rdd <- parallelize(sc, 1:2, 2L)
#
# # Large Matrix object that we want to broadcast
# randomMat <- matrix(nrow=100, ncol=10, data=rnorm(1000))
# randomMatBr <- broadcast(sc, randomMat)
#
# # Use the broadcast variable inside the function
# useBroadcast <- function(x) {
# sum(value(randomMatBr) * x)
# }
# sumRDD <- lapply(rdd, useBroadcast)
#}
broadcast <- function(sc, object) {
objName <- as.character(substitute(object))
serializedObj <- serialize(object, connection = NULL)
@@ -205,21 +205,21 @@ broadcast <- function(sc, object) {
Broadcast(id, object, jBroadcast, objName)
}

#' @title Set the checkpoint directory
#'
#' Set the directory under which RDDs are going to be checkpointed. The
#' directory must be a HDFS path if running on a cluster.
#'
#' @param sc Spark Context to use
#' @param dirName Directory path
#' @export
#' @examples
#'\dontrun{
#' sc <- sparkR.init()
#' setCheckpointDir(sc, "~/checkpoint")
#' rdd <- parallelize(sc, 1:2, 2L)
#' checkpoint(rdd)
#'}
# @title Set the checkpoint directory
#
# Set the directory under which RDDs are going to be checkpointed. The
# directory must be a HDFS path if running on a cluster.
#
# @param sc Spark Context to use
# @param dirName Directory path
# @export
# @examples
#\dontrun{
# sc <- sparkR.init()
# setCheckpointDir(sc, "~/checkpoint")
# rdd <- parallelize(sc, 1:2, 2L)
# checkpoint(rdd)
#}
setCheckpointDir <- function(sc, dirName) {
invisible(callJMethod(sc, "setCheckpointDir", suppressWarnings(normalizePath(dirName))))
}