diff --git a/sql/core/src/main/scala/org/apache/spark/sql/Column.scala b/sql/core/src/main/scala/org/apache/spark/sql/Column.scala index c0503bf047052..8eb632d3d600b 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/Column.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/Column.scala @@ -84,14 +84,14 @@ class Column(protected[sql] val expr: Expression) extends Logging { /** * Inversion of boolean expression, i.e. NOT. - * {{ + * {{{ * // Scala: select rows that are not active (isActive === false) * df.filter( !df("isActive") ) * * // Java: * import static org.apache.spark.sql.functions.*; * df.filter( not(df.col("isActive")) ); - * }} + * }}} * * @group expr_ops */ diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala index aceb7a9627edd..9d2cd7aae3b82 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala @@ -681,11 +681,11 @@ class DataFrame private[sql]( /** * (Scala-specific) Aggregates on the entire [[DataFrame]] without groups. - * {{ + * {{{ * // df.agg(...) is a shorthand for df.groupBy().agg(...) * df.agg("age" -> "max", "salary" -> "avg") * df.groupBy().agg("age" -> "max", "salary" -> "avg") - * }} + * }}} * @group dfops */ def agg(aggExpr: (String, String), aggExprs: (String, String)*): DataFrame = { @@ -694,33 +694,33 @@ class DataFrame private[sql]( /** * (Scala-specific) Aggregates on the entire [[DataFrame]] without groups. - * {{ + * {{{ * // df.agg(...) is a shorthand for df.groupBy().agg(...) * df.agg(Map("age" -> "max", "salary" -> "avg")) * df.groupBy().agg(Map("age" -> "max", "salary" -> "avg")) - * }} + * }}} * @group dfops */ def agg(exprs: Map[String, String]): DataFrame = groupBy().agg(exprs) /** * (Java-specific) Aggregates on the entire [[DataFrame]] without groups. - * {{ + * {{{ * // df.agg(...) is a shorthand for df.groupBy().agg(...) * df.agg(Map("age" -> "max", "salary" -> "avg")) * df.groupBy().agg(Map("age" -> "max", "salary" -> "avg")) - * }} + * }}} * @group dfops */ def agg(exprs: java.util.Map[String, String]): DataFrame = groupBy().agg(exprs) /** * Aggregates on the entire [[DataFrame]] without groups. - * {{ + * {{{ * // df.agg(...) is a shorthand for df.groupBy().agg(...) * df.agg(max($"age"), avg($"salary")) * df.groupBy().agg(max($"age"), avg($"salary")) - * }} + * }}} * @group dfops */ @scala.annotation.varargs diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala index cb88deab35968..a1e74470afc89 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala @@ -37,7 +37,7 @@ final class DataFrameStatFunctions private[sql](df: DataFrame) { StatFunctions.calculateCov(df, Seq(col1, col2)) } - /* + /** * Calculates the correlation of two columns of a DataFrame. Currently only supports the Pearson * Correlation Coefficient. For Spearman Correlation, consider using RDD methods found in * MLlib's Statistics. @@ -75,7 +75,7 @@ final class DataFrameStatFunctions private[sql](df: DataFrame) { * each row. * @param col2 The name of the second column. Distinct items will make the column names * of the DataFrame. - * @return A Local DataFrame containing the table + * @return A DataFrame containing for the contingency table. */ def crosstab(col1: String, col2: String): DataFrame = { StatFunctions.crossTabulate(df, col1, col2) @@ -110,14 +110,25 @@ final class DataFrameStatFunctions private[sql](df: DataFrame) { } /** - * Python friendly implementation for `freqItems` + * (Scala-specific) Finding frequent items for columns, possibly with false positives. Using the + * frequent element count algorithm described in + * [[http://dx.doi.org/10.1145/762471.762473, proposed by Karp, Schenker, and Papadimitriou]]. + * + * @param cols the names of the columns to search frequent items in. + * @return A Local DataFrame with the Array of frequent items for each column. */ def freqItems(cols: Seq[String], support: Double): DataFrame = { FrequentItems.singlePassFreqItems(df, cols, support) } /** - * Python friendly implementation for `freqItems` with a default `support` of 1%. + * (Scala-specific) Finding frequent items for columns, possibly with false positives. Using the + * frequent element count algorithm described in + * [[http://dx.doi.org/10.1145/762471.762473, proposed by Karp, Schenker, and Papadimitriou]]. + * Uses a `default` support of 1%. + * + * @param cols the names of the columns to search frequent items in. + * @return A Local DataFrame with the Array of frequent items for each column. */ def freqItems(cols: Seq[String]): DataFrame = { FrequentItems.singlePassFreqItems(df, cols, 0.01) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala index f2bb4534c75eb..830b5017717b5 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala @@ -22,7 +22,7 @@ import scala.reflect.runtime.universe.{TypeTag, typeTag} import org.apache.spark.annotation.Experimental import org.apache.spark.sql.catalyst.ScalaReflection -import org.apache.spark.sql.catalyst.analysis.{UnresolvedAttribute, UnresolvedFunction, Star} +import org.apache.spark.sql.catalyst.analysis.{UnresolvedFunction, Star} import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.expressions.mathfuncs._ import org.apache.spark.sql.types._ @@ -86,10 +86,10 @@ object functions { /** * Returns a sort expression based on ascending order of the column. - * {{ + * {{{ * // Sort by dept in ascending order, and then age in descending order. * df.sort(asc("dept"), desc("age")) - * }} + * }}} * * @group sort_funcs */ @@ -97,10 +97,10 @@ object functions { /** * Returns a sort expression based on the descending order of the column. - * {{ + * {{{ * // Sort by dept in ascending order, and then age in descending order. * df.sort(asc("dept"), desc("age")) - * }} + * }}} * * @group sort_funcs */ @@ -353,13 +353,13 @@ object functions { /** * Inversion of boolean expression, i.e. NOT. - * {{ + * {{{ * // Scala: select rows that are not active (isActive === false) * df.filter( !df("isActive") ) * * // Java: * df.filter( not(df.col("isActive")) ); - * }} + * }}} * * @group normal_funcs */