Merge pull request #1554 from apache/master

GulajavaMinistudio authored Sep 11, 2023
2 parents 2962b0c + 3d119a5 commit 46fe32d
Showing 45 changed files with 1,267 additions and 1,023 deletions.
@@ -2624,7 +2624,7 @@ object functions {
* @group math_funcs
* @since 3.5.0
*/
-  def ln(e: Column): Column = log(e)
+  def ln(e: Column): Column = Column.fn("ln", e)

/**
* Computes the natural logarithm of the given value.
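The one-line change above is in the Scala client for Spark Connect (the JSON/proto golden files further down are Connect query-test fixtures): `ln` mistakenly delegated to `log`, so it planned as `LOG(E(), b)` instead of the dedicated `ln` expression. For comparison, a minimal PySpark sketch — PySpark already dispatched correctly, and the Scala client now matches this naming:

```python
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

spark = SparkSession.builder.getOrCreate()
df = spark.range(1, 4).select(F.col("id").cast("double").alias("b"))

# The result column is named after the ln expression itself, which is
# what the fixed Scala client now produces as well (see the golden plan
# change below: LOG(E(), b) -> ln(b)).
print(df.select(F.ln("b")).columns)  # ['ln(b)']
```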
@@ -3503,7 +3503,7 @@ object functions {
mode: Column,
padding: Column,
aad: Column): Column =
Column.fn("aes_encrypt", input, key, mode, padding, aad)
Column.fn("aes_decrypt", input, key, mode, padding, aad)

/**
* Returns a decrypted value of `input`.
@@ -3515,7 +3515,7 @@
* @since 3.5.0
*/
def aes_decrypt(input: Column, key: Column, mode: Column, padding: Column): Column =
Column.fn("aes_encrypt", input, key, mode, padding)
Column.fn("aes_decrypt", input, key, mode, padding)

/**
* Returns a decrypted value of `input`.
@@ -3527,7 +3527,7 @@
* @since 3.5.0
*/
def aes_decrypt(input: Column, key: Column, mode: Column): Column =
Column.fn("aes_encrypt", input, key, mode)
Column.fn("aes_decrypt", input, key, mode)

/**
* Returns a decrypted value of `input`.
@@ -3539,7 +3539,7 @@
* @since 3.5.0
*/
def aes_decrypt(input: Column, key: Column): Column =
Column.fn("aes_encrypt", input, key)
Column.fn("aes_decrypt", input, key)

/**
* This is a special version of `aes_decrypt` that performs the same operation, but returns a
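All four `aes_decrypt` overloads above shared a copy-paste bug: they dispatched to the `aes_encrypt` expression, so a decrypt call from the Connect Scala client would encrypt a second time instead. The intended round-trip semantics, sketched in PySpark and assuming a Spark 3.5+ session where `pyspark.sql.functions` exposes both functions; the key value is illustrative:

```python
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

spark = SparkSession.builder.getOrCreate()
key = "0000111122223333"  # illustrative 16-byte key (AES-128)

df = spark.createDataFrame([("Spark",)], ["value"])
roundtrip = df.select(
    # Encrypt, then decrypt with the same key and the default GCM mode;
    # the result should be the original plaintext, not doubly encrypted
    # bytes as with the pre-fix Scala overloads.
    F.aes_decrypt(F.aes_encrypt(F.col("value"), F.lit(key)), F.lit(key))
    .cast("string")
    .alias("decrypted")
)
roundtrip.show()  # expect: Spark
```

The updated golden explain plans below confirm the dispatch change: `aesEncrypt` becomes `aesDecrypt`, and the encrypt-only `iv` argument disappears from the decrypt plans.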
@@ -1,2 +1,2 @@
-Project [staticinvoke(class org.apache.spark.sql.catalyst.expressions.ExpressionImplUtils, BinaryType, aesEncrypt, cast(g#0 as binary), cast(g#0 as binary), GCM, DEFAULT, cast( as binary), cast( as binary), BinaryType, BinaryType, StringType, StringType, BinaryType, BinaryType, true, true, true) AS aes_encrypt(g, g, GCM, DEFAULT, , )#0]
+Project [staticinvoke(class org.apache.spark.sql.catalyst.expressions.ExpressionImplUtils, BinaryType, aesDecrypt, cast(g#0 as binary), cast(g#0 as binary), GCM, DEFAULT, cast( as binary), BinaryType, BinaryType, StringType, StringType, BinaryType, true, true, true) AS aes_decrypt(g, g, GCM, DEFAULT, )#0]
+- LocalRelation <empty>, [id#0L, a#0, b#0, d#0, e#0, f#0, g#0]
@@ -1,2 +1,2 @@
-Project [staticinvoke(class org.apache.spark.sql.catalyst.expressions.ExpressionImplUtils, BinaryType, aesEncrypt, cast(g#0 as binary), cast(g#0 as binary), g#0, DEFAULT, cast( as binary), cast( as binary), BinaryType, BinaryType, StringType, StringType, BinaryType, BinaryType, true, true, true) AS aes_encrypt(g, g, g, DEFAULT, , )#0]
+Project [staticinvoke(class org.apache.spark.sql.catalyst.expressions.ExpressionImplUtils, BinaryType, aesDecrypt, cast(g#0 as binary), cast(g#0 as binary), g#0, DEFAULT, cast( as binary), BinaryType, BinaryType, StringType, StringType, BinaryType, true, true, true) AS aes_decrypt(g, g, g, DEFAULT, )#0]
+- LocalRelation <empty>, [id#0L, a#0, b#0, d#0, e#0, f#0, g#0]
@@ -1,2 +1,2 @@
-Project [staticinvoke(class org.apache.spark.sql.catalyst.expressions.ExpressionImplUtils, BinaryType, aesEncrypt, cast(g#0 as binary), cast(g#0 as binary), g#0, g#0, cast( as binary), cast( as binary), BinaryType, BinaryType, StringType, StringType, BinaryType, BinaryType, true, true, true) AS aes_encrypt(g, g, g, g, , )#0]
+Project [staticinvoke(class org.apache.spark.sql.catalyst.expressions.ExpressionImplUtils, BinaryType, aesDecrypt, cast(g#0 as binary), cast(g#0 as binary), g#0, g#0, cast( as binary), BinaryType, BinaryType, StringType, StringType, BinaryType, true, true, true) AS aes_decrypt(g, g, g, g, )#0]
+- LocalRelation <empty>, [id#0L, a#0, b#0, d#0, e#0, f#0, g#0]
@@ -1,2 +1,2 @@
-Project [staticinvoke(class org.apache.spark.sql.catalyst.expressions.ExpressionImplUtils, BinaryType, aesEncrypt, cast(g#0 as binary), cast(g#0 as binary), g#0, g#0, cast(g#0 as binary), cast( as binary), BinaryType, BinaryType, StringType, StringType, BinaryType, BinaryType, true, true, true) AS aes_encrypt(g, g, g, g, g, )#0]
+Project [staticinvoke(class org.apache.spark.sql.catalyst.expressions.ExpressionImplUtils, BinaryType, aesDecrypt, cast(g#0 as binary), cast(g#0 as binary), g#0, g#0, cast(g#0 as binary), BinaryType, BinaryType, StringType, StringType, BinaryType, true, true, true) AS aes_decrypt(g, g, g, g, g)#0]
+- LocalRelation <empty>, [id#0L, a#0, b#0, d#0, e#0, f#0, g#0]
@@ -1,2 +1,2 @@
-Project [LOG(E(), b#0) AS LOG(E(), b)#0]
+Project [ln(b#0) AS ln(b)#0]
+- LocalRelation <empty>, [id#0L, a#0, b#0, d#0, e#0, f#0, g#0]
@@ -13,7 +13,7 @@
},
"expressions": [{
"unresolvedFunction": {
"functionName": "aes_encrypt",
"functionName": "aes_decrypt",
"arguments": [{
"unresolvedAttribute": {
"unparsedIdentifier": "g"
Binary file not shown.
@@ -13,7 +13,7 @@
},
"expressions": [{
"unresolvedFunction": {
"functionName": "aes_encrypt",
"functionName": "aes_decrypt",
"arguments": [{
"unresolvedAttribute": {
"unparsedIdentifier": "g"
Binary file not shown.
@@ -13,7 +13,7 @@
},
"expressions": [{
"unresolvedFunction": {
"functionName": "aes_encrypt",
"functionName": "aes_decrypt",
"arguments": [{
"unresolvedAttribute": {
"unparsedIdentifier": "g"
Binary file not shown.
@@ -13,7 +13,7 @@
},
"expressions": [{
"unresolvedFunction": {
"functionName": "aes_encrypt",
"functionName": "aes_decrypt",
"arguments": [{
"unresolvedAttribute": {
"unparsedIdentifier": "g"
Binary file not shown.
@@ -13,7 +13,7 @@
},
"expressions": [{
"unresolvedFunction": {
"functionName": "log",
"functionName": "ln",
"arguments": [{
"unresolvedAttribute": {
"unparsedIdentifier": "b"
Binary file not shown.
351 changes: 347 additions & 4 deletions core/src/main/resources/org/apache/spark/ui/static/graphlib-dot.min.js

Large diffs are not rendered by default.

28 changes: 28 additions & 0 deletions docs/_layouts/redirect.html
@@ -0,0 +1,28 @@
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->

<!DOCTYPE html>
<html lang="en-US">
<meta charset="utf-8">
<title>Redirecting&hellip;</title>
<link rel="canonical" href="{{ page.redirect.to }}.html">
<script>location="{{ page.redirect.to }}.html"</script>
<meta http-equiv="refresh" content="0; url={{ page.redirect.to }}.html">
<meta name="robots" content="noindex">
<h1>Redirecting&hellip;</h1>
<a href="{{ page.redirect.to }}.html">Click here if you are not redirected.</a>
</html>
1 change: 1 addition & 0 deletions docs/css/custom.css
@@ -9,6 +9,7 @@ body {
overflow-wrap: anywhere;
overflow-x: hidden;
padding-top: 80px;
+padding-bottom: 20px;
}

a {
4 changes: 2 additions & 2 deletions docs/running-on-mesos.md
@@ -136,7 +136,7 @@ If you want to deploy a Spark Application into a Mesos cluster that is running i
- `LIBPROCESS_SSL_KEY_FILE=pathToKeyFile.key` path to key
- `LIBPROCESS_SSL_CERT_FILE=pathToCRTFile.crt` the certificate file to be used

-All options can be found at http://mesos.apache.org/documentation/latest/ssl/
+All options can be found at [http://mesos.apache.org/documentation/latest/ssl/](http://mesos.apache.org/documentation/latest/ssl/)

Then submit happens as described in Client mode or Cluster mode below

@@ -579,7 +579,7 @@ See the [configuration page](configuration.html) for information on Spark config
This only affects docker containers, and must be one of "docker"
or "mesos". Mesos supports two types of
containerizers for docker: the "docker" containerizer, and the preferred
"mesos" containerizer. Read more here: http://mesos.apache.org/documentation/latest/container-image/
"mesos" containerizer. Read more <a href="http://mesos.apache.org/documentation/latest/container-image/">here</a>.
</td>
<td>2.1.0</td>
</tr>
6 changes: 3 additions & 3 deletions docs/running-on-yarn.md
@@ -163,7 +163,7 @@ To use a custom metrics.properties for the application master and executors, upd
Amount of resource to use for the YARN Application Master in client mode.
In cluster mode, use <code>spark.yarn.driver.resource.&lt;resource-type&gt;.amount</code> instead.
Please note that this feature can be used only with YARN 3.0+
-For reference, see YARN Resource Model documentation: https://hadoop.apache.org/docs/current/hadoop-yarn/hadoop-yarn-site/ResourceModel.html
+For reference, see <a href="https://hadoop.apache.org/docs/current/hadoop-yarn/hadoop-yarn-site/ResourceModel.html">YARN Resource Model documentation</a>
<p/>
Example:
To request GPU resources from YARN, use: <code>spark.yarn.am.resource.yarn.io/gpu.amount</code>
@@ -185,7 +185,7 @@ To use a custom metrics.properties for the application master and executors, upd
<td>
Amount of resource to use for the YARN Application Master in cluster mode.
Please note that this feature can be used only with YARN 3.0+
-For reference, see YARN Resource Model documentation: https://hadoop.apache.org/docs/current/hadoop-yarn/hadoop-yarn-site/ResourceModel.html
+For reference, see <a href="https://hadoop.apache.org/docs/current/hadoop-yarn/hadoop-yarn-site/ResourceModel.html">YARN Resource Model documentation</a>
<p/>
Example:
To request GPU resources from YARN, use: <code>spark.yarn.driver.resource.yarn.io/gpu.amount</code>
@@ -198,7 +198,7 @@ To use a custom metrics.properties for the application master and executors, upd
<td>
Amount of resource to use per executor process.
Please note that this feature can be used only with YARN 3.0+
-For reference, see YARN Resource Model documentation: https://hadoop.apache.org/docs/current/hadoop-yarn/hadoop-yarn-site/ResourceModel.html
+For reference, see <a href="https://hadoop.apache.org/docs/current/hadoop-yarn/hadoop-yarn-site/ResourceModel.html">YARN Resource Model documentation</a>
<p/>
Example:
To request GPU resources from YARN, use: <code>spark.yarn.executor.resource.yarn.io/gpu.amount</code>
32 changes: 24 additions & 8 deletions python/pyspark/pandas/groupby.py
@@ -857,10 +857,10 @@ def sum(self, numeric_only: Optional[bool] = True, min_count: int = 0) -> FrameL
... "C": [3, 4, 3, 4], "D": ["a", "a", "b", "a"]})
>>> df.groupby("A").sum().sort_index()
-   B  C
+   B  C   D
A
-1  1  6
-2  1  8
+1  1  6  ab
+2  1  8  aa
>>> df.groupby("D").sum().sort_index()
A B C
@@ -900,17 +900,17 @@ def sum(self, numeric_only: Optional[bool] = True, min_count: int = 0) -> FrameL
unsupported = [
col.name
for col in self._agg_columns
-if not isinstance(col.spark.data_type, (NumericType, BooleanType))
+if not isinstance(col.spark.data_type, (NumericType, BooleanType, StringType))
]
if len(unsupported) > 0:
log_advice(
"GroupBy.sum() can only support numeric and bool columns even if"
"GroupBy.sum() can only support numeric, bool and string columns even if"
f"numeric_only=False, skip unsupported columns: {unsupported}"
)

return self._reduce_for_stat_function(
F.sum,
-accepted_spark_types=(NumericType, BooleanType),
+accepted_spark_types=(NumericType, BooleanType, StringType),
bool_to_numeric=True,
min_count=min_count,
)
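With string columns now accepted, `GroupBy.sum` matches pandas 2.x behavior, which concatenates the strings in each group instead of dropping the column. A quick illustration mirroring the updated doctest — the `A` and `B` values here are reconstructed to be consistent with the output shown above:

```python
import pyspark.pandas as ps

df = ps.DataFrame({"A": [1, 2, 1, 2], "B": [True, False, False, True],
                   "C": [3, 4, 3, 4], "D": ["a", "a", "b", "a"]})

# Booleans are summed as integers; strings are concatenated per group
# in row order.
print(df.groupby("A").sum().sort_index())
#    B  C   D
# A
# 1  1  6  ab
# 2  1  8  aa
```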
@@ -3534,7 +3534,7 @@ def _reduce_for_stat_function(
for label in psdf._internal.column_labels:
psser = psdf._psser_for(label)
input_scol = psser._dtype_op.nan_to_null(psser).spark.column
-output_scol = sfun(input_scol)
+if sfun.__name__ == "sum" and isinstance(
+    psdf._internal.spark_type_for(label), StringType
+):
+    input_scol_name = psser._internal.data_spark_column_names[0]
+    # Sort data with natural order column to ensure order of data
+    sorted_array = F.array_sort(
+        F.collect_list(F.struct(NATURAL_ORDER_COLUMN_NAME, input_scol))
+    )
+
+    # Using transform to extract strings
+    output_scol = F.concat_ws(
+        "", F.transform(sorted_array, lambda x: x.getField(input_scol_name))
+    )
+else:
+    output_scol = sfun(input_scol)

if min_count > 0:
output_scol = F.when(
@@ -3591,7 +3605,9 @@ def _prepare_reduce(
):
agg_columns.append(psser)
sdf = self._psdf._internal.spark_frame.select(
-*groupkey_scols, *[psser.spark.column for psser in agg_columns]
+*groupkey_scols,
+*[psser.spark.column for psser in agg_columns],
+NATURAL_ORDER_COLUMN_NAME,
)
internal = InternalFrame(
spark_frame=sdf,
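For reference, the ordered-concatenation trick used in `_reduce_for_stat_function` can be reproduced in plain PySpark: collect `(order, value)` structs, sort the array (structs compare field by field, so the leading order column dominates), then project the values back out and join them. A standalone sketch with illustrative column names standing in for `NATURAL_ORDER_COLUMN_NAME` and the data column:

```python
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame(
    [(1, 0, "a"), (1, 1, "b"), (2, 0, "c")], ["grp", "ord", "val"]
)

agg = df.groupBy("grp").agg(
    F.concat_ws(
        "",
        F.transform(
            # collect_list gives no ordering guarantee, so sort by the
            # leading "ord" struct field to restore row order.
            F.array_sort(F.collect_list(F.struct("ord", "val"))),
            lambda x: x.getField("val"),
        ),
    ).alias("val_sum")
)
agg.show()  # grp=1 -> "ab", grp=2 -> "c"
```

This is also why `_prepare_reduce` now selects `NATURAL_ORDER_COLUMN_NAME` into the aggregation frame: the order column must survive into the grouped data for the sort to work.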
6 changes: 0 additions & 6 deletions python/pyspark/pandas/tests/groupby/test_groupby.py
@@ -59,9 +59,6 @@ def test_groupby_simple(self):
},
index=[0, 1, 3, 5, 6, 8, 9, 9, 9],
)
-if LooseVersion(pd.__version__) >= LooseVersion("2.0.0"):
-    # TODO(SPARK-43295): Make DataFrameGroupBy.sum support for string type columns
-    pdf = pdf[["a", "b", "c", "e"]]
psdf = ps.from_pandas(pdf)

for as_index in [True, False]:
@@ -180,9 +177,6 @@ def sort(df):
index=[0, 1, 3, 5, 6, 8, 9, 9, 9],
)
psdf = ps.from_pandas(pdf)
-if LooseVersion(pd.__version__) >= LooseVersion("2.0.0"):
-    # TODO(SPARK-43295): Make DataFrameGroupBy.sum support for string type columns
-    pdf = pdf[[10, 20, 30]]

for as_index in [True, False]:
if as_index:
2 changes: 1 addition & 1 deletion python/pyspark/pandas/tests/groupby/test_stat.py
@@ -113,7 +113,7 @@ def test_basic_stat_funcs(self):
# self._test_stat_func(lambda groupby_obj: groupby_obj.sum(), check_exact=False)
self.assert_eq(
psdf.groupby("A").sum().sort_index(),
pdf.groupby("A").sum(numeric_only=True).sort_index(),
pdf.groupby("A").sum().sort_index(),
check_exact=False,
)

@@ -66,7 +66,7 @@ def sort(df):

self.assert_eq(
sort(psdf1.groupby(psdf2.a, as_index=as_index).sum()),
-sort(pdf1.groupby(pdf2.a, as_index=as_index).sum(numeric_only=True)),
+sort(pdf1.groupby(pdf2.a, as_index=as_index).sum()),
almost=as_index,
)

@@ -93,7 +93,7 @@ def test_groupby_multiindex_columns(self):

self.assert_eq(
psdf1.groupby(psdf2[("x", "a")]).sum().sort_index(),
pdf1.groupby(pdf2[("x", "a")]).sum(numeric_only=True).sort_index(),
pdf1.groupby(pdf2[("x", "a")]).sum().sort_index(),
)

self.assert_eq(
@@ -102,7 +102,7 @@
.sort_values(("y", "c"))
.reset_index(drop=True),
pdf1.groupby(pdf2[("x", "a")], as_index=False)
-.sum(numeric_only=True)
+.sum()
.sort_values(("y", "c"))
.reset_index(drop=True),
)