From 0367eb3abfd5c23ed2d55c4fdd538a2daddb78a1 Mon Sep 17 00:00:00 2001 From: panbingkun Date: Fri, 12 Jan 2024 10:30:39 +0800 Subject: [PATCH 1/2] [SPARK-46692][BUILD][PYSPARK] Fix potential issues with environment variable transmission `PYTHON_TO_TEST` --- .github/workflows/build_and_test.yml | 3 ++ python/pyspark/sql/functions/builtin.py | 63 ++++++++++++++++++------- 2 files changed, 49 insertions(+), 17 deletions(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index beeef93fee03b..089a9296bcc4e 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -430,6 +430,7 @@ jobs: distribution: zulu java-version: ${{ matrix.java }} - name: List Python packages (${{ env.PYTHON_TO_TEST }}) + env: ${{ fromJSON(inputs.envs) }} shell: 'script -q -e -c "bash {0}"' run: | for py in $(echo $PYTHON_TO_TEST | tr "," "\n") @@ -467,12 +468,14 @@ jobs: flags: unittests name: PySpark - name: Upload test results to report + env: ${{ fromJSON(inputs.envs) }} if: always() uses: actions/upload-artifact@v3 with: name: test-results-${{ matrix.modules }}--${{ matrix.java }}-${{ inputs.hadoop }}-hive2.3 path: "**/target/test-reports/*.xml" - name: Upload unit tests log files + env: ${{ fromJSON(inputs.envs) }} if: ${{ !success() }} uses: actions/upload-artifact@v3 with: diff --git a/python/pyspark/sql/functions/builtin.py b/python/pyspark/sql/functions/builtin.py index a05ce7b043683..659a6c9f7717e 100644 --- a/python/pyspark/sql/functions/builtin.py +++ b/python/pyspark/sql/functions/builtin.py @@ -18513,46 +18513,75 @@ def aes_encrypt( Examples -------- + + Example 1: + + >>> import pyspark.sql.functions as sf >>> df = spark.createDataFrame([( ... "Spark", "abcdefghijklmnop12345678ABCDEFGH", "GCM", "DEFAULT", ... "000000000000000000000000", "This is an AAD mixed into the input",)], ... ["input", "key", "mode", "padding", "iv", "aad"] ... ) - >>> df.select(base64(aes_encrypt( - ... df.input, df.key, df.mode, df.padding, to_binary(df.iv, lit("hex")), df.aad) - ... ).alias('r')).collect() - [Row(r='AAAAAAAAAAAAAAAAQiYi+sTLm7KD9UcZ2nlRdYDe/PX4')] + >>> df.select(sf.base64(sf.aes_encrypt( + ... df.input, df.key, df.mode, df.padding, sf.to_binary(df.iv, sf.lit("hex")), df.aad) + ... )).show(truncate=False) + +-----------------------------------------------------------------------+ + |base64(aes_encrypt(input, key, mode, padding, to_binary(iv, hex), aad))| + +-----------------------------------------------------------------------+ + |AAAAAAAAAAAAAAAAQiYi+sTLm7KD9UcZ2nlRdYDe/PX4 | + +-----------------------------------------------------------------------+ - >>> df.select(base64(aes_encrypt( - ... df.input, df.key, df.mode, df.padding, to_binary(df.iv, lit("hex"))) - ... ).alias('r')).collect() - [Row(r='AAAAAAAAAAAAAAAAQiYi+sRNYDAOTjdSEcYBFsAWPL1f')] + Example 2: + >>> import pyspark.sql.functions as sf + >>> df = spark.createDataFrame([( + ... "Spark", "abcdefghijklmnop12345678ABCDEFGH", "GCM", "DEFAULT", + ... "000000000000000000000000", "This is an AAD mixed into the input",)], + ... ["input", "key", "mode", "padding", "iv", "aad"] + ... ) + >>> df.select(sf.base64(sf.aes_encrypt( + ... df.input, df.key, df.mode, df.padding, sf.to_binary(df.iv, sf.lit("hex"))) + ... )).show(truncate=False) + +--------------------------------------------------------------------+ + |base64(aes_encrypt(input, key, mode, padding, to_binary(iv, hex), ))| + +--------------------------------------------------------------------+ + |AAAAAAAAAAAAAAAAQiYi+sRNYDAOTjdSEcYBFsAWPL1f | + +--------------------------------------------------------------------+ + + Example 3: + + >>> import pyspark.sql.functions as sf >>> df = spark.createDataFrame([( ... "Spark SQL", "1234567890abcdef", "ECB", "PKCS",)], ... ["input", "key", "mode", "padding"] ... ) - >>> df.select(aes_decrypt(aes_encrypt(df.input, df.key, df.mode, df.padding), - ... df.key, df.mode, df.padding).alias('r') - ... ).collect() + >>> df.select(sf.aes_decrypt(sf.aes_encrypt(df.input, df.key, df.mode, df.padding), + ... df.key, df.mode, df.padding) + ... ).show(truncate=False) [Row(r=bytearray(b'Spark SQL'))] + Example 4: + + >>> import pyspark.sql.functions as sf >>> df = spark.createDataFrame([( ... "Spark SQL", "0000111122223333", "ECB",)], ... ["input", "key", "mode"] ... ) - >>> df.select(aes_decrypt(aes_encrypt(df.input, df.key, df.mode), - ... df.key, df.mode).alias('r') - ... ).collect() + >>> df.select(sf.aes_decrypt(sf.aes_encrypt(df.input, df.key, df.mode), + ... df.key, df.mode) + ... ).show(truncate=False) [Row(r=bytearray(b'Spark SQL'))] + Example 5: + + >>> import pyspark.sql.functions as sf >>> df = spark.createDataFrame([( ... "Spark SQL", "abcdefghijklmnop",)], ... ["input", "key"] ... ) - >>> df.select(aes_decrypt( - ... unbase64(base64(aes_encrypt(df.input, df.key))), df.key - ... ).cast("STRING").alias('r')).collect() + >>> df.select(sf.aes_decrypt( + ... sf.unbase64(sf.base64(sf.aes_encrypt(df.input, df.key))), df.key + ... ).cast("STRING")).show(truncate=False) [Row(r='Spark SQL')] """ _mode = lit("GCM") if mode is None else mode From 76b8a2b523186c297714aeac840f534a216e39dd Mon Sep 17 00:00:00 2001 From: panbingkun Date: Fri, 12 Jan 2024 10:38:29 +0800 Subject: [PATCH 2/2] [SPARK-46692][BUILD][PYSPARK] Fix potential issues with environment variable transmission `PYTHON_TO_TEST` --- python/pyspark/sql/functions/builtin.py | 63 +++++++------------------ 1 file changed, 17 insertions(+), 46 deletions(-) diff --git a/python/pyspark/sql/functions/builtin.py b/python/pyspark/sql/functions/builtin.py index 659a6c9f7717e..a05ce7b043683 100644 --- a/python/pyspark/sql/functions/builtin.py +++ b/python/pyspark/sql/functions/builtin.py @@ -18513,75 +18513,46 @@ def aes_encrypt( Examples -------- - - Example 1: - - >>> import pyspark.sql.functions as sf >>> df = spark.createDataFrame([( ... "Spark", "abcdefghijklmnop12345678ABCDEFGH", "GCM", "DEFAULT", ... "000000000000000000000000", "This is an AAD mixed into the input",)], ... ["input", "key", "mode", "padding", "iv", "aad"] ... ) - >>> df.select(sf.base64(sf.aes_encrypt( - ... df.input, df.key, df.mode, df.padding, sf.to_binary(df.iv, sf.lit("hex")), df.aad) - ... )).show(truncate=False) - +-----------------------------------------------------------------------+ - |base64(aes_encrypt(input, key, mode, padding, to_binary(iv, hex), aad))| - +-----------------------------------------------------------------------+ - |AAAAAAAAAAAAAAAAQiYi+sTLm7KD9UcZ2nlRdYDe/PX4 | - +-----------------------------------------------------------------------+ + >>> df.select(base64(aes_encrypt( + ... df.input, df.key, df.mode, df.padding, to_binary(df.iv, lit("hex")), df.aad) + ... ).alias('r')).collect() + [Row(r='AAAAAAAAAAAAAAAAQiYi+sTLm7KD9UcZ2nlRdYDe/PX4')] - Example 2: + >>> df.select(base64(aes_encrypt( + ... df.input, df.key, df.mode, df.padding, to_binary(df.iv, lit("hex"))) + ... ).alias('r')).collect() + [Row(r='AAAAAAAAAAAAAAAAQiYi+sRNYDAOTjdSEcYBFsAWPL1f')] - >>> import pyspark.sql.functions as sf - >>> df = spark.createDataFrame([( - ... "Spark", "abcdefghijklmnop12345678ABCDEFGH", "GCM", "DEFAULT", - ... "000000000000000000000000", "This is an AAD mixed into the input",)], - ... ["input", "key", "mode", "padding", "iv", "aad"] - ... ) - >>> df.select(sf.base64(sf.aes_encrypt( - ... df.input, df.key, df.mode, df.padding, sf.to_binary(df.iv, sf.lit("hex"))) - ... )).show(truncate=False) - +--------------------------------------------------------------------+ - |base64(aes_encrypt(input, key, mode, padding, to_binary(iv, hex), ))| - +--------------------------------------------------------------------+ - |AAAAAAAAAAAAAAAAQiYi+sRNYDAOTjdSEcYBFsAWPL1f | - +--------------------------------------------------------------------+ - - Example 3: - - >>> import pyspark.sql.functions as sf >>> df = spark.createDataFrame([( ... "Spark SQL", "1234567890abcdef", "ECB", "PKCS",)], ... ["input", "key", "mode", "padding"] ... ) - >>> df.select(sf.aes_decrypt(sf.aes_encrypt(df.input, df.key, df.mode, df.padding), - ... df.key, df.mode, df.padding) - ... ).show(truncate=False) + >>> df.select(aes_decrypt(aes_encrypt(df.input, df.key, df.mode, df.padding), + ... df.key, df.mode, df.padding).alias('r') + ... ).collect() [Row(r=bytearray(b'Spark SQL'))] - Example 4: - - >>> import pyspark.sql.functions as sf >>> df = spark.createDataFrame([( ... "Spark SQL", "0000111122223333", "ECB",)], ... ["input", "key", "mode"] ... ) - >>> df.select(sf.aes_decrypt(sf.aes_encrypt(df.input, df.key, df.mode), - ... df.key, df.mode) - ... ).show(truncate=False) + >>> df.select(aes_decrypt(aes_encrypt(df.input, df.key, df.mode), + ... df.key, df.mode).alias('r') + ... ).collect() [Row(r=bytearray(b'Spark SQL'))] - Example 5: - - >>> import pyspark.sql.functions as sf >>> df = spark.createDataFrame([( ... "Spark SQL", "abcdefghijklmnop",)], ... ["input", "key"] ... ) - >>> df.select(sf.aes_decrypt( - ... sf.unbase64(sf.base64(sf.aes_encrypt(df.input, df.key))), df.key - ... ).cast("STRING")).show(truncate=False) + >>> df.select(aes_decrypt( + ... unbase64(base64(aes_encrypt(df.input, df.key))), df.key + ... ).cast("STRING").alias('r')).collect() [Row(r='Spark SQL')] """ _mode = lit("GCM") if mode is None else mode