Update scalafmt-core to 3.0.0 #77

Merged · 2 commits · Aug 19, 2021
2 changes: 1 addition & 1 deletion .scalafmt.conf
@@ -1,4 +1,4 @@
version = "2.7.5"
version = "3.0.0"
align.preset = more
maxColumn = 80
assumeStandardLibraryStripMargin = true
2 changes: 1 addition & 1 deletion build.sbt
@@ -1,5 +1,5 @@
ThisBuild / organization := "org.hablapps"
-ThisBuild / homepage := Some(url("https://github.com/hablapps/doric"))
+ThisBuild / homepage := Some(url("https://github.com/hablapps/doric"))
ThisBuild / licenses := List(
"Apache-2.0" -> url("https://www.apache.org/licenses/LICENSE-2.0")
)
4 changes: 2 additions & 2 deletions project/plugins.sbt
@@ -1,4 +1,4 @@
logLevel := Level.Warn

addSbtPlugin("com.geirsson" % "sbt-ci-release" % "1.5.7")
addSbtPlugin("org.scalameta" % "sbt-scalafmt" % "2.4.3")
addSbtPlugin("com.geirsson" % "sbt-ci-release" % "1.5.7")
addSbtPlugin("org.scalameta" % "sbt-scalafmt" % "2.4.3")
34 changes: 20 additions & 14 deletions src/main/scala/doric/sem/AggregationOps.scala
@@ -9,23 +9,24 @@ trait AggregationOps extends RelationalGroupedDatasetDoricInterface {
implicit class DataframeAggSyntax(df: Dataset[_]) {

/**
- * Groups the Dataset using the specified columns, so we can run aggregation on them. See
+ * Groups the Dataset using the specified columns, so we can run
+ * aggregation on them. See
*/
def groupBy(cols: DoricColumn[_]*): RelationalGroupedDataset = {
sparkGroupBy(df.toDF(), cols: _*).returnOrThrow("groupBy")
}

/**
- * Create a multi-dimensional cube for the current Dataset using the specified columns,
- * so we can run aggregation on them.
+ * Create a multi-dimensional cube for the current Dataset using the
+ * specified columns, so we can run aggregation on them.
*/
def cube(cols: DoricColumn[_]*): RelationalGroupedDataset = {
sparkCube(df.toDF(), cols: _*).returnOrThrow("cube")
}

/**
- * Create a multi-dimensional rollup for the current Dataset using the specified columns,
- * so we can run aggregation on them.
+ * Create a multi-dimensional rollup for the current Dataset using the
+ * specified columns, so we can run aggregation on them.
*/
def rollup(cols: DoricColumn[_]*): RelationalGroupedDataset = {
sparkRollup(df.toDF(), cols: _*).returnOrThrow("rollup")
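For orientation, here is a minimal usage sketch of the three grouping entry points above. The SparkSession setup, the sample dataframe, its column names, and the `import doric._` style are assumptions for illustration, not part of this diff:

```scala
import doric._
import org.apache.spark.sql.{DataFrame, SparkSession}

val spark = SparkSession.builder().master("local[*]").getOrCreate()
import spark.implicits._

// Hypothetical input: a string key and an integer measure.
val df: DataFrame = Seq(("a", 1), ("a", 2), ("b", 3)).toDF("key", "value")

// Each entry point takes doric columns instead of raw Spark columns, so a
// missing column or a type mismatch is reported before aggregation runs.
val grouped  = df.groupBy(colString("key"))
val cubed    = df.cube(colString("key"))
val rolledUp = df.rollup(colString("key"))
```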
@@ -35,21 +36,26 @@ trait AggregationOps extends RelationalGroupedDatasetDoricInterface {
implicit class RelationalGroupedDatasetSem(rel: RelationalGroupedDataset) {

/**
- * Compute aggregates by specifying a series of aggregate columns. Note that this function by
- * default retains the grouping columns in its output. To not retain grouping columns, set
+ * Compute aggregates by specifying a series of aggregate columns. Note
+ * that this function by default retains the grouping columns in its
+ * output. To not retain grouping columns, set
* `spark.sql.retainGroupColumns` to false.
*/
def agg(col: DoricColumn[_], cols: DoricColumn[_]*): DataFrame =
sparkAgg(rel, col, cols: _*).returnOrThrow("agg")
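Continuing the sketch, `agg` over the grouped result; `sum` stands in for whichever doric aggregate helper your version exposes and is an assumption of this example:

```scala
// Grouping columns are retained in the output by default; set
// spark.sql.retainGroupColumns to false to drop them.
val totals = df.groupBy(colString("key")).agg(sum(colInt("value")))
```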

/**
- * Pivots a column of the current `DataFrame` and performs the specified aggregation.
- * There are two versions of pivot function: one that requires the caller to specify the list
- * of distinct values to pivot on, and one that does not. The latter is more concise but less
- * efficient, because Spark needs to first compute the list of distinct values internally.
- * @param expr doric column to pivot
- * @param values the values of the column to extract
- * @tparam T The type of the column and parameters
+ * Pivots a column of the current `DataFrame` and performs the specified
+ * aggregation. There are two versions of the pivot function: one that
+ * requires the caller to specify the list of distinct values to pivot on,
+ * and one that does not. The latter is more concise but less efficient,
+ * because Spark needs to first compute the list of distinct values
+ * internally.
+ * @param expr
+ *   doric column to pivot
+ * @param values
+ *   the values of the column to extract
+ * @tparam T
+ *   The type of the column and parameters
*/
def pivot[T](expr: DoricColumn[T])(
values: Seq[T]
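A hedged sketch of the explicit-values variant declared above, assuming it behaves like Spark's own pivot and can be followed by `agg`; the `country` column and the value list are made up:

```scala
// Supplying the distinct values up front avoids the extra Spark job that
// the value-discovering variant would need.
val pivoted = df
  .groupBy(colString("key"))
  .pivot(colString("country"))(Seq("ES", "FR"))
  .agg(sum(colInt("value")))
```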
72 changes: 48 additions & 24 deletions src/main/scala/doric/sem/CollectOps.scala
@@ -9,21 +9,29 @@ trait CollectOps {

/**
* Collects the provided columns of the dataframe
- * @param col1 the Doric column to collect from the dataframe
- * @tparam T1 the type of the column to collect, must have an Spark `Encoder`
- * @return The array of the selected column
+ * @param col1
+ *   the Doric column to collect from the dataframe
+ * @tparam T1
+ *   the type of the column to collect, must have a Spark `Encoder`
+ * @return
+ *   The array of the selected column
*/
def collectCols[T1: Encoder](col1: DoricColumn[T1]): Array[T1] = {
df.select(col1).as[T1].collect()
}
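For example, collecting one typed column back to the driver (reusing the hypothetical `df` from the earlier sketch):

```scala
// The Int Encoder is resolved implicitly; a wrong name or type surfaces
// as a collected doric error rather than a runtime AnalysisException.
val values: Array[Int] = df.collectCols(colInt("value"))
```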

/**
* Collects the provided columns of the dataframe
- * @param col1 the Doric column to collect from the dataframe
- * @param col2 other Doric column to collect from the dataframe
- * @tparam T1 the type of the column to collect, must have an Spark `Encoder`
- * @tparam T2 the type of the second column to collect, must have an Spark `Encoder`
- * @return The array of the selected columns
+ * @param col1
+ *   the Doric column to collect from the dataframe
+ * @param col2
+ *   other Doric column to collect from the dataframe
+ * @tparam T1
+ *   the type of the column to collect, must have a Spark `Encoder`
+ * @tparam T2
+ *   the type of the second column to collect, must have a Spark `Encoder`
+ * @return
+ *   The array of the selected columns
*/
def collectCols[T1, T2](
col1: DoricColumn[T1],
@@ -34,13 +42,20 @@

/**
* Collects the provided columns of the dataframe
- * @param col1 the Doric column to collect from the dataframe
- * @param col2 second Doric column to collect from the dataframe
- * @param col3 third Doric column to collect from the dataframe
- * @tparam T1 the type of the column to collect, must have an Spark `Encoder`
- * @tparam T2 the type of the second column to collect, must have an Spark `Encoder`
- * @tparam T3 the type of the third column to collect, must have an Spark `Encoder`
- * @return The array of the selected columns
+ * @param col1
+ *   the Doric column to collect from the dataframe
+ * @param col2
+ *   second Doric column to collect from the dataframe
+ * @param col3
+ *   third Doric column to collect from the dataframe
+ * @tparam T1
+ *   the type of the column to collect, must have a Spark `Encoder`
+ * @tparam T2
+ *   the type of the second column to collect, must have a Spark `Encoder`
+ * @tparam T3
+ *   the type of the third column to collect, must have a Spark `Encoder`
+ * @return
+ *   The array of the selected columns
*/
def collectCols[T1, T2, T3](
col1: DoricColumn[T1],
@@ -52,15 +67,24 @@

/**
* Collects the provided columns of the dataframe
- * @param col1 the Doric column to collect from the dataframe
- * @param col2 second Doric column to collect from the dataframe
- * @param col3 third Doric column to collect from the dataframe
- * @param col4 forth Doric column to collect from the dataframe
- * @tparam T1 the type of the column to collect, must have an Spark `Encoder`
- * @tparam T2 the type of the second column to collect, must have an Spark `Encoder`
- * @tparam T3 the type of the third column to collect, must have an Spark `Encoder`
- * @tparam T4 the type of the forth column to collect, must have an Spark `Encoder`
- * @return The array of the selected columns
+ * @param col1
+ *   the Doric column to collect from the dataframe
+ * @param col2
+ *   second Doric column to collect from the dataframe
+ * @param col3
+ *   third Doric column to collect from the dataframe
+ * @param col4
+ *   fourth Doric column to collect from the dataframe
+ * @tparam T1
+ *   the type of the column to collect, must have a Spark `Encoder`
+ * @tparam T2
+ *   the type of the second column to collect, must have a Spark `Encoder`
+ * @tparam T3
+ *   the type of the third column to collect, must have a Spark `Encoder`
+ * @tparam T4
+ *   the type of the fourth column to collect, must have a Spark `Encoder`
+ * @return
+ *   The array of the selected columns
*/
def collectCols[T1, T2, T3, T4](
col1: DoricColumn[T1],
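The higher arities follow the same pattern; a sketch of the two-column overload, assuming it returns an array of tuples as the scaladoc's "array of the selected columns" suggests:

```scala
// One Encoder per selected column.
val pairs: Array[(String, Int)] =
  df.collectCols(colString("key"), colInt("value"))
```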
2 changes: 1 addition & 1 deletion src/main/scala/doric/sem/Errors.scala
@@ -14,7 +14,7 @@ case class DoricMultiError(

implicit class StringOps(s: String) {
private val indentation = " "
-def withTabs: String = indentation + s.replaceAll("\n", s"\n$indentation")
+def withTabs: String = indentation + s.replaceAll("\n", s"\n$indentation")
}

implicit class JoinCases(errors: NonEmptyChain[DoricSingleError]) {
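The `withTabs` helper above only re-indents every line of a nested error message; a standalone sketch of the same transformation:

```scala
val indentation = "  "
def withTabs(s: String): String =
  indentation + s.replaceAll("\n", s"\n$indentation")

// withTabs("first\nsecond") == "  first\n  second", so each line of a
// child error lines up under its DoricMultiError header.
```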
59 changes: 36 additions & 23 deletions src/main/scala/doric/sem/JoinOps.scala
@@ -9,20 +9,25 @@ trait JoinOps {
implicit class DataframeJoinSyntax[A](df: Dataset[A]) {

/**
- * Join with another `DataFrame`, using the given doric columns. The following performs
- * a full outer join between `df1` and `df2` by the key `dfKey` that must be string type.
+ * Join with another `DataFrame`, using the given doric columns. The
+ * following performs a full outer join between `df1` and `df2` by the key
+ * `dfKey` that must be string type.
*
* {{{
* df1.join(df2,"outer", colString("dfKey"))
* }}}
*
- * @param df2 Right side of the join.
- * @param joinType Type of join to perform. Default `inner`. Must be one of:
- * `inner`, `cross`, `outer`, `full`, `fullouter`, `full_outer`, `left`,
- * `leftouter`, `left_outer`, `right`, `rightouter`, `right_outer`,
- * `semi`, `leftsemi`, `left_semi`, `anti`, `leftanti`, `left_anti`.
- * @param col Doric column that must be in both dataframes.
- * @param cols rest of doric columns that must be in both dataframes.
+ * @param df2
+ *   Right side of the join.
+ * @param joinType
+ *   Type of join to perform. Default `inner`. Must be one of: `inner`,
+ *   `cross`, `outer`, `full`, `fullouter`, `full_outer`, `left`,
+ *   `leftouter`, `left_outer`, `right`, `rightouter`, `right_outer`,
+ *   `semi`, `leftsemi`, `left_semi`, `anti`, `leftanti`, `left_anti`.
+ * @param col
+ *   Doric column that must be in both dataframes.
+ * @param cols
+ *   rest of doric columns that must be in both dataframes.
*/
def join(
df2: Dataset[_],
@@ -48,20 +53,24 @@
}
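Following the scaladoc above, a sketch with one extra key column; `df1`, `df2`, the key names, and their types are assumptions:

```scala
// Every key column must exist, with a matching doric type, in both sides.
val joined = df1.join(df2, "left", colString("dfKey"), colInt("dfKey2"))
```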

/**
- * Join with another `DataFrame`, using the given doric columns. The following performs
- * a full outer join between `df1` with key `dfKey1` and `df2` with key `dfKey2` that must be string type.
+ * Join with another `DataFrame`, using the given doric columns. The
+ * following performs a full outer join between `df1` with key `dfKey1` and
+ * `df2` with key `dfKey2` that must be string type.
*
* {{{
* val joinColumn = Left.colString("dfKey1") === Right.colString("dfKey2")
* df1.join(df2, joinColumn, "outer")
* }}}
*
- * @param df2 Right side of the join.
- * @param colum Doric join column that must be in both dataframes.
- * @param joinType Type of join to perform. Default `inner`. Must be one of:
- * `inner`, `cross`, `outer`, `full`, `fullouter`, `full_outer`, `left`,
- * `leftouter`, `left_outer`, `right`, `rightouter`, `right_outer`,
- * `semi`, `leftsemi`, `left_semi`, `anti`, `leftanti`, `left_anti`.
+ * @param df2
+ *   Right side of the join.
+ * @param colum
+ *   Doric join column that must be in both dataframes.
+ * @param joinType
+ *   Type of join to perform. Default `inner`. Must be one of: `inner`,
+ *   `cross`, `outer`, `full`, `fullouter`, `full_outer`, `left`,
+ *   `leftouter`, `left_outer`, `right`, `rightouter`, `right_outer`,
+ *   `semi`, `leftsemi`, `left_semi`, `anti`, `leftanti`, `left_anti`.
*/
def join(
df2: Dataset[_],
@@ -75,17 +84,21 @@
}
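Expanding the scaladoc's own example, with `Left` and `Right` marking which side each key resolves against:

```scala
// === on the two sides builds the DoricJoinColumn passed to join.
val joinColumn = Left.colString("dfKey1") === Right.colString("dfKey2")
val joined     = df1.join(df2, joinColumn, "outer")
```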

/**
- * Join with another `DataFrame`, using the given doric columns. The following performs
- * a inner join between `df1` and `df2` by the key `dfKey` that must be string type.
- * It drops in the return dataframes the `dfKey` column of the right dataframe.
+ * Join with another `DataFrame`, using the given doric columns. The
+ * following performs an inner join between `df1` and `df2` by the key
+ * `dfKey` that must be string type. The `dfKey` column of the right
+ * dataframe is dropped from the returned dataframe.
*
* {{{
* df1.innerJoinKeepLeftKeys(df2, colString("dfKey"))
* }}}
*
- * @param df2 Right side of the join.
- * @param column Doric column that must be in both dataframes.
- * @param columns rest of doric columns that must be in both dataframes.
+ * @param df2
+ *   Right side of the join.
+ * @param column
+ *   Doric column that must be in both dataframes.
+ * @param columns
+ *   rest of doric columns that must be in both dataframes.
*/
def innerJoinKeepLeftKeys(
df2: Dataset[_],
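As in the scaladoc example, an inner join that keeps only the left-hand key column (dataframe names assumed):

```scala
// The result carries a single dfKey column, taken from df1.
val joined = df1.innerJoinKeepLeftKeys(df2, colString("dfKey"))
```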
23 changes: 14 additions & 9 deletions src/main/scala/doric/sem/TransformOps.scala
@@ -10,15 +10,18 @@ trait TransformOps {
implicit class DataframeTransformationSyntax[A](df: Dataset[A]) {

/**
- * Returns a new Dataset by adding a column or replacing the existing column that has
- * the same name.
+ * Returns a new Dataset by adding a column or replacing the existing
+ * column that has the same name.
*
- * `column`'s expression must only refer to attributes supplied by this Dataset. It is an
- * error to add a column that refers to some other Dataset.
+ * `column`'s expression must only refer to attributes supplied by this
+ * Dataset. It is an error to add a column that refers to some other
+ * Dataset.
*
- * @note this method introduces a projection internally. Therefore, calling it multiple times,
- * for instance, via loops in order to add multiple columns can generate big plans which
- * can cause performance issues and even `StackOverflowException`.
+ * @note
+ *   this method introduces a projection internally. Therefore, calling it
+ *   multiple times, for instance, via loops in order to add multiple
+ *   columns can generate big plans which can cause performance issues and
+ *   even `StackOverflowException`.
*/
def withColumn[T](colName: String, col: DoricColumn[T]): DataFrame = {
col.elem
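A minimal sketch of `withColumn`; the dataframe and column names are assumptions, and, per the note above, prefer a single select to a long chain of withColumn calls:

```scala
// Adds a column derived from an existing one; writing to an existing
// name replaces that column instead.
val out = df.withColumn("keyCopy", colString("key"))
```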
@@ -35,7 +38,8 @@
* peopleDs.where(colInt("age") > 15)
* }}}
*
- * @param condition BooleanColumn that let pass elements that are true
+ * @param condition
+ *   BooleanColumn that lets pass the elements that are true
*/
def filter(condition: BooleanColumn): Dataset[A] = {
condition.elem
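Reusing the doc's own predicate, `filter` and its alias `where` (defined just below) both take a doric BooleanColumn, so the predicate is checked against the schema before the Dataset is filtered:

```scala
// Keeps the rows where the predicate evaluates to true.
val adults = peopleDs.filter(colInt("age") > 15)
val same   = peopleDs.where(colInt("age") > 15)
```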
@@ -52,7 +56,8 @@
* peopleDs.where(colInt("age") > 15)
* }}}
*
- * @param condition BooleanColumn that let pass elements that are true
+ * @param condition
+ *   BooleanColumn that lets pass the elements that are true
*/
def where(condition: BooleanColumn): Dataset[A] = {
condition.elem