Merge pull request #1554 from apache/master

GulajavaMinistudio authored Sep 11, 2023
2 parents 2962b0c + 3d119a5 commit 46fe32d
Showing 45 changed files with 1,267 additions and 1,023 deletions.
@@ -2624,7 +2624,7 @@ object functions {
* @group math_funcs
* @since 3.5.0
*/
-  def ln(e: Column): Column = log(e)
+  def ln(e: Column): Column = Column.fn("ln", e)

/**
* Computes the natural logarithm of the given value.
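The one-line change above is in the Scala client for Spark Connect (the JSON/proto golden files further down are Connect query-test fixtures): `ln` mistakenly delegated to `log`, so it planned as `LOG(E(), b)` instead of the dedicated `ln` expression. For comparison, a minimal PySpark sketch — PySpark already dispatched correctly, and the Scala client now matches this naming:

```python
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

spark = SparkSession.builder.getOrCreate()
df = spark.range(1, 4).select(F.col("id").cast("double").alias("b"))

# The result column is named after the ln expression itself, which is
# what the fixed Scala client now produces as well (see the golden plan
# change below: LOG(E(), b) -> ln(b)).
print(df.select(F.ln("b")).columns)  # ['ln(b)']
```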
@@ -3503,7 +3503,7 @@ object functions {
mode: Column,
padding: Column,
aad: Column): Column =
Column.fn("aes_encrypt", input, key, mode, padding, aad)
Column.fn("aes_decrypt", input, key, mode, padding, aad)

/**
* Returns a decrypted value of `input`.
@@ -3515,7 +3515,7 @@
* @since 3.5.0
*/
def aes_decrypt(input: Column, key: Column, mode: Column, padding: Column): Column =
Column.fn("aes_encrypt", input, key, mode, padding)
Column.fn("aes_decrypt", input, key, mode, padding)

/**
* Returns a decrypted value of `input`.
@@ -3527,7 +3527,7 @@
* @since 3.5.0
*/
def aes_decrypt(input: Column, key: Column, mode: Column): Column =
Column.fn("aes_encrypt", input, key, mode)
Column.fn("aes_decrypt", input, key, mode)

/**
* Returns a decrypted value of `input`.
@@ -3539,7 +3539,7 @@
* @since 3.5.0
*/
def aes_decrypt(input: Column, key: Column): Column =
Column.fn("aes_encrypt", input, key)
Column.fn("aes_decrypt", input, key)

/**
* This is a special version of `aes_decrypt` that performs the same operation, but returns a
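All four `aes_decrypt` overloads above shared a copy-paste bug: they dispatched to the `aes_encrypt` expression, so a decrypt call from the Connect Scala client would encrypt a second time instead. The intended round-trip semantics, sketched in PySpark and assuming a Spark 3.5+ session where `pyspark.sql.functions` exposes both functions; the key value is illustrative:

```python
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

spark = SparkSession.builder.getOrCreate()
key = "0000111122223333"  # illustrative 16-byte key (AES-128)

df = spark.createDataFrame([("Spark",)], ["value"])
roundtrip = df.select(
    # Encrypt, then decrypt with the same key and the default GCM mode;
    # the result should be the original plaintext, not doubly encrypted
    # bytes as with the pre-fix Scala overloads.
    F.aes_decrypt(F.aes_encrypt(F.col("value"), F.lit(key)), F.lit(key))
    .cast("string")
    .alias("decrypted")
)
roundtrip.show()  # expect: Spark
```

The updated golden explain plans below confirm the dispatch change: `aesEncrypt` becomes `aesDecrypt`, and the encrypt-only `iv` argument disappears from the decrypt plans.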
@@ -1,2 +1,2 @@
-Project [staticinvoke(class org.apache.spark.sql.catalyst.expressions.ExpressionImplUtils, BinaryType, aesEncrypt, cast(g#0 as binary), cast(g#0 as binary), GCM, DEFAULT, cast( as binary), cast( as binary), BinaryType, BinaryType, StringType, StringType, BinaryType, BinaryType, true, true, true) AS aes_encrypt(g, g, GCM, DEFAULT, , )#0]
+Project [staticinvoke(class org.apache.spark.sql.catalyst.expressions.ExpressionImplUtils, BinaryType, aesDecrypt, cast(g#0 as binary), cast(g#0 as binary), GCM, DEFAULT, cast( as binary), BinaryType, BinaryType, StringType, StringType, BinaryType, true, true, true) AS aes_decrypt(g, g, GCM, DEFAULT, )#0]
+- LocalRelation <empty>, [id#0L, a#0, b#0, d#0, e#0, f#0, g#0]
@@ -1,2 +1,2 @@
-Project [staticinvoke(class org.apache.spark.sql.catalyst.expressions.ExpressionImplUtils, BinaryType, aesEncrypt, cast(g#0 as binary), cast(g#0 as binary), g#0, DEFAULT, cast( as binary), cast( as binary), BinaryType, BinaryType, StringType, StringType, BinaryType, BinaryType, true, true, true) AS aes_encrypt(g, g, g, DEFAULT, , )#0]
+Project [staticinvoke(class org.apache.spark.sql.catalyst.expressions.ExpressionImplUtils, BinaryType, aesDecrypt, cast(g#0 as binary), cast(g#0 as binary), g#0, DEFAULT, cast( as binary), BinaryType, BinaryType, StringType, StringType, BinaryType, true, true, true) AS aes_decrypt(g, g, g, DEFAULT, )#0]
+- LocalRelation <empty>, [id#0L, a#0, b#0, d#0, e#0, f#0, g#0]
@@ -1,2 +1,2 @@
-Project [staticinvoke(class org.apache.spark.sql.catalyst.expressions.ExpressionImplUtils, BinaryType, aesEncrypt, cast(g#0 as binary), cast(g#0 as binary), g#0, g#0, cast( as binary), cast( as binary), BinaryType, BinaryType, StringType, StringType, BinaryType, BinaryType, true, true, true) AS aes_encrypt(g, g, g, g, , )#0]
+Project [staticinvoke(class org.apache.spark.sql.catalyst.expressions.ExpressionImplUtils, BinaryType, aesDecrypt, cast(g#0 as binary), cast(g#0 as binary), g#0, g#0, cast( as binary), BinaryType, BinaryType, StringType, StringType, BinaryType, true, true, true) AS aes_decrypt(g, g, g, g, )#0]
+- LocalRelation <empty>, [id#0L, a#0, b#0, d#0, e#0, f#0, g#0]
@@ -1,2 +1,2 @@
-Project [staticinvoke(class org.apache.spark.sql.catalyst.expressions.ExpressionImplUtils, BinaryType, aesEncrypt, cast(g#0 as binary), cast(g#0 as binary), g#0, g#0, cast(g#0 as binary), cast( as binary), BinaryType, BinaryType, StringType, StringType, BinaryType, BinaryType, true, true, true) AS aes_encrypt(g, g, g, g, g, )#0]
+Project [staticinvoke(class org.apache.spark.sql.catalyst.expressions.ExpressionImplUtils, BinaryType, aesDecrypt, cast(g#0 as binary), cast(g#0 as binary), g#0, g#0, cast(g#0 as binary), BinaryType, BinaryType, StringType, StringType, BinaryType, true, true, true) AS aes_decrypt(g, g, g, g, g)#0]
+- LocalRelation <empty>, [id#0L, a#0, b#0, d#0, e#0, f#0, g#0]
@@ -1,2 +1,2 @@
-Project [LOG(E(), b#0) AS LOG(E(), b)#0]
+Project [ln(b#0) AS ln(b)#0]
+- LocalRelation <empty>, [id#0L, a#0, b#0, d#0, e#0, f#0, g#0]
@@ -13,7 +13,7 @@
},
"expressions": [{
"unresolvedFunction": {
"functionName": "aes_encrypt",
"functionName": "aes_decrypt",
"arguments": [{
"unresolvedAttribute": {
"unparsedIdentifier": "g"
Binary file not shown.
@@ -13,7 +13,7 @@
},
"expressions": [{
"unresolvedFunction": {
"functionName": "aes_encrypt",
"functionName": "aes_decrypt",
"arguments": [{
"unresolvedAttribute": {
"unparsedIdentifier": "g"
Binary file not shown.
@@ -13,7 +13,7 @@
},
"expressions": [{
"unresolvedFunction": {
"functionName": "aes_encrypt",
"functionName": "aes_decrypt",
"arguments": [{
"unresolvedAttribute": {
"unparsedIdentifier": "g"
Binary file not shown.
@@ -13,7 +13,7 @@
},
"expressions": [{
"unresolvedFunction": {
"functionName": "aes_encrypt",
"functionName": "aes_decrypt",
"arguments": [{
"unresolvedAttribute": {
"unparsedIdentifier": "g"
Binary file not shown.
@@ -13,7 +13,7 @@
},
"expressions": [{
"unresolvedFunction": {
"functionName": "log",
"functionName": "ln",
"arguments": [{
"unresolvedAttribute": {
"unparsedIdentifier": "b"
Binary file not shown.
351 changes: 347 additions & 4 deletions core/src/main/resources/org/apache/spark/ui/static/graphlib-dot.min.js

Large diffs are not rendered by default.

28 changes: 28 additions & 0 deletions docs/_layouts/redirect.html
@@ -0,0 +1,28 @@
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->

<!DOCTYPE html>
<html lang="en-US">
<meta charset="utf-8">
<title>Redirecting&hellip;</title>
<link rel="canonical" href="{{ page.redirect.to }}.html">
<script>location="{{ page.redirect.to }}.html"</script>
<meta http-equiv="refresh" content="0; url={{ page.redirect.to }}.html">
<meta name="robots" content="noindex">
<h1>Redirecting&hellip;</h1>
<a href="{{ page.redirect.to }}.html">Click here if you are not redirected.</a>
</html>
1 change: 1 addition & 0 deletions docs/css/custom.css
@@ -9,6 +9,7 @@ body {
overflow-wrap: anywhere;
overflow-x: hidden;
padding-top: 80px;
+padding-bottom: 20px;
}

a {
4 changes: 2 additions & 2 deletions docs/running-on-mesos.md
@@ -136,7 +136,7 @@ If you want to deploy a Spark Application into a Mesos cluster that is running i
- `LIBPROCESS_SSL_KEY_FILE=pathToKeyFile.key` path to key
- `LIBPROCESS_SSL_CERT_FILE=pathToCRTFile.crt` the certificate file to be used

-All options can be found at http://mesos.apache.org/documentation/latest/ssl/
+All options can be found at [http://mesos.apache.org/documentation/latest/ssl/](http://mesos.apache.org/documentation/latest/ssl/)

Then submit happens as described in Client mode or Cluster mode below

@@ -579,7 +579,7 @@ See the [configuration page](configuration.html) for information on Spark config
This only affects docker containers, and must be one of "docker"
or "mesos". Mesos supports two types of
containerizers for docker: the "docker" containerizer, and the preferred
"mesos" containerizer. Read more here: http://mesos.apache.org/documentation/latest/container-image/
"mesos" containerizer. Read more <a href="http://mesos.apache.org/documentation/latest/container-image/">here</a>.
</td>
<td>2.1.0</td>
</tr>
6 changes: 3 additions & 3 deletions docs/running-on-yarn.md
@@ -163,7 +163,7 @@ To use a custom metrics.properties for the application master and executors, upd
Amount of resource to use for the YARN Application Master in client mode.
In cluster mode, use <code>spark.yarn.driver.resource.&lt;resource-type&gt;.amount</code> instead.
Please note that this feature can be used only with YARN 3.0+
-For reference, see YARN Resource Model documentation: https://hadoop.apache.org/docs/current/hadoop-yarn/hadoop-yarn-site/ResourceModel.html
+For reference, see <a href="https://hadoop.apache.org/docs/current/hadoop-yarn/hadoop-yarn-site/ResourceModel.html">YARN Resource Model documentation</a>
<p/>
Example:
To request GPU resources from YARN, use: <code>spark.yarn.am.resource.yarn.io/gpu.amount</code>
@@ -185,7 +185,7 @@ To use a custom metrics.properties for the application master and executors, upd
<td>
Amount of resource to use for the YARN Application Master in cluster mode.
Please note that this feature can be used only with YARN 3.0+
-For reference, see YARN Resource Model documentation: https://hadoop.apache.org/docs/current/hadoop-yarn/hadoop-yarn-site/ResourceModel.html
+For reference, see <a href="https://hadoop.apache.org/docs/current/hadoop-yarn/hadoop-yarn-site/ResourceModel.html">YARN Resource Model documentation</a>
<p/>
Example:
To request GPU resources from YARN, use: <code>spark.yarn.driver.resource.yarn.io/gpu.amount</code>
@@ -198,7 +198,7 @@ To use a custom metrics.properties for the application master and executors, upd
<td>
Amount of resource to use per executor process.
Please note that this feature can be used only with YARN 3.0+
-For reference, see YARN Resource Model documentation: https://hadoop.apache.org/docs/current/hadoop-yarn/hadoop-yarn-site/ResourceModel.html
+For reference, see <a href="https://hadoop.apache.org/docs/current/hadoop-yarn/hadoop-yarn-site/ResourceModel.html">YARN Resource Model documentation</a>
<p/>
Example:
To request GPU resources from YARN, use: <code>spark.yarn.executor.resource.yarn.io/gpu.amount</code>
32 changes: 24 additions & 8 deletions python/pyspark/pandas/groupby.py
@@ -857,10 +857,10 @@ def sum(self, numeric_only: Optional[bool] = True, min_count: int = 0) -> FrameL
... "C": [3, 4, 3, 4], "D": ["a", "a", "b", "a"]})
>>> df.groupby("A").sum().sort_index()
-   B  C
+   B  C   D
A
-1  1  6
-2  1  8
+1  1  6  ab
+2  1  8  aa
>>> df.groupby("D").sum().sort_index()
A B C
@@ -900,17 +900,17 @@ def sum(self, numeric_only: Optional[bool] = True, min_count: int = 0) -> FrameL
unsupported = [
col.name
for col in self._agg_columns
-if not isinstance(col.spark.data_type, (NumericType, BooleanType))
+if not isinstance(col.spark.data_type, (NumericType, BooleanType, StringType))
]
if len(unsupported) > 0:
log_advice(
"GroupBy.sum() can only support numeric and bool columns even if"
"GroupBy.sum() can only support numeric, bool and string columns even if"
f"numeric_only=False, skip unsupported columns: {unsupported}"
)

return self._reduce_for_stat_function(
F.sum,
-accepted_spark_types=(NumericType, BooleanType),
+accepted_spark_types=(NumericType, BooleanType, StringType),
bool_to_numeric=True,
min_count=min_count,
)
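With string columns now accepted, `GroupBy.sum` matches pandas 2.x behavior, which concatenates the strings in each group instead of dropping the column. A quick illustration mirroring the updated doctest — the `A` and `B` values here are reconstructed to be consistent with the output shown above:

```python
import pyspark.pandas as ps

df = ps.DataFrame({"A": [1, 2, 1, 2], "B": [True, False, False, True],
                   "C": [3, 4, 3, 4], "D": ["a", "a", "b", "a"]})

# Booleans are summed as integers; strings are concatenated per group
# in row order.
print(df.groupby("A").sum().sort_index())
#    B  C   D
# A
# 1  1  6  ab
# 2  1  8  aa
```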
@@ -3534,7 +3534,7 @@ def _reduce_for_stat_function(
for label in psdf._internal.column_labels:
psser = psdf._psser_for(label)
input_scol = psser._dtype_op.nan_to_null(psser).spark.column
-output_scol = sfun(input_scol)
+if sfun.__name__ == "sum" and isinstance(
+    psdf._internal.spark_type_for(label), StringType
+):
+    input_scol_name = psser._internal.data_spark_column_names[0]
+    # Sort data with natural order column to ensure order of data
+    sorted_array = F.array_sort(
+        F.collect_list(F.struct(NATURAL_ORDER_COLUMN_NAME, input_scol))
+    )
+
+    # Using transform to extract strings
+    output_scol = F.concat_ws(
+        "", F.transform(sorted_array, lambda x: x.getField(input_scol_name))
+    )
+else:
+    output_scol = sfun(input_scol)

if min_count > 0:
output_scol = F.when(
@@ -3591,7 +3605,9 @@ def _prepare_reduce(
):
agg_columns.append(psser)
sdf = self._psdf._internal.spark_frame.select(
-*groupkey_scols, *[psser.spark.column for psser in agg_columns]
+*groupkey_scols,
+*[psser.spark.column for psser in agg_columns],
+NATURAL_ORDER_COLUMN_NAME,
)
internal = InternalFrame(
spark_frame=sdf,
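For reference, the ordered-concatenation trick used in `_reduce_for_stat_function` can be reproduced in plain PySpark: collect `(order, value)` structs, sort the array (structs compare field by field, so the leading order column dominates), then project the values back out and join them. A standalone sketch with illustrative column names standing in for `NATURAL_ORDER_COLUMN_NAME` and the data column:

```python
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame(
    [(1, 0, "a"), (1, 1, "b"), (2, 0, "c")], ["grp", "ord", "val"]
)

agg = df.groupBy("grp").agg(
    F.concat_ws(
        "",
        F.transform(
            # collect_list gives no ordering guarantee, so sort by the
            # leading "ord" struct field to restore row order.
            F.array_sort(F.collect_list(F.struct("ord", "val"))),
            lambda x: x.getField("val"),
        ),
    ).alias("val_sum")
)
agg.show()  # grp=1 -> "ab", grp=2 -> "c"
```

This is also why `_prepare_reduce` now selects `NATURAL_ORDER_COLUMN_NAME` into the aggregation frame: the order column must survive into the grouped data for the sort to work.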
6 changes: 0 additions & 6 deletions python/pyspark/pandas/tests/groupby/test_groupby.py
@@ -59,9 +59,6 @@ def test_groupby_simple(self):
},
index=[0, 1, 3, 5, 6, 8, 9, 9, 9],
)
-if LooseVersion(pd.__version__) >= LooseVersion("2.0.0"):
-    # TODO(SPARK-43295): Make DataFrameGroupBy.sum support for string type columns
-    pdf = pdf[["a", "b", "c", "e"]]
psdf = ps.from_pandas(pdf)

for as_index in [True, False]:
@@ -180,9 +177,6 @@ def sort(df):
index=[0, 1, 3, 5, 6, 8, 9, 9, 9],
)
psdf = ps.from_pandas(pdf)
-if LooseVersion(pd.__version__) >= LooseVersion("2.0.0"):
-    # TODO(SPARK-43295): Make DataFrameGroupBy.sum support for string type columns
-    pdf = pdf[[10, 20, 30]]

for as_index in [True, False]:
if as_index:
2 changes: 1 addition & 1 deletion python/pyspark/pandas/tests/groupby/test_stat.py
@@ -113,7 +113,7 @@ def test_basic_stat_funcs(self):
# self._test_stat_func(lambda groupby_obj: groupby_obj.sum(), check_exact=False)
self.assert_eq(
psdf.groupby("A").sum().sort_index(),
pdf.groupby("A").sum(numeric_only=True).sort_index(),
pdf.groupby("A").sum().sort_index(),
check_exact=False,
)

@@ -66,7 +66,7 @@ def sort(df):

self.assert_eq(
sort(psdf1.groupby(psdf2.a, as_index=as_index).sum()),
-sort(pdf1.groupby(pdf2.a, as_index=as_index).sum(numeric_only=True)),
+sort(pdf1.groupby(pdf2.a, as_index=as_index).sum()),
almost=as_index,
)

@@ -93,7 +93,7 @@ def test_groupby_multiindex_columns(self):

self.assert_eq(
psdf1.groupby(psdf2[("x", "a")]).sum().sort_index(),
pdf1.groupby(pdf2[("x", "a")]).sum(numeric_only=True).sort_index(),
pdf1.groupby(pdf2[("x", "a")]).sum().sort_index(),
)

self.assert_eq(
@@ -102,7 +102,7 @@
.sort_values(("y", "c"))
.reset_index(drop=True),
pdf1.groupby(pdf2[("x", "a")], as_index=False)
-.sum(numeric_only=True)
+.sum()
.sort_values(("y", "c"))
.reset_index(drop=True),
)