diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 00ba16265dce4..4a11823aee604 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -83,15 +83,21 @@ jobs: yarn=`./dev/is-changed.py -m yarn` kubernetes=`./dev/is-changed.py -m kubernetes` sparkr=`./dev/is-changed.py -m sparkr` + tpcds=`./dev/is-changed.py -m sql` + docker=`./dev/is-changed.py -m docker-integration-tests` buf=true ui=true + docs=true else pandas=false yarn=false kubernetes=false sparkr=false + tpcds=false + docker=false buf=false ui=false + docs=false fi build=`./dev/is-changed.py -m "core,unsafe,kvstore,avro,utils,network-common,network-shuffle,repl,launcher,examples,sketch,variant,api,catalyst,hive-thriftserver,mllib-local,mllib,graphx,streaming,sql-kafka-0-10,streaming-kafka-0-10,streaming-kinesis-asl,kubernetes,hadoop-cloud,spark-ganglia-lgpl,protobuf,yarn,connect,sql,hive"` precondition=" @@ -100,9 +106,10 @@ jobs: \"pyspark\": \"$pyspark\", \"pyspark-pandas\": \"$pandas\", \"sparkr\": \"$sparkr\", - \"tpcds-1g\": \"false\", - \"docker-integration-tests\": \"false\", + \"tpcds-1g\": \"$tpcds\", + \"docker-integration-tests\": \"$docker\", \"lint\" : \"true\", + \"docs\" : \"$docs\", \"yarn\" : \"$yarn\", \"k8s-integration-tests\" : \"$kubernetes\", \"buf\" : \"$buf\", @@ -621,12 +628,12 @@ jobs: - name: Python CodeGen check run: ./dev/connect-check-protos.py - # Static analysis, and documentation build + # Static analysis lint: needs: [precondition, infra-image] # always run if lint == 'true', even infra-image is skip (such as non-master job) if: (!cancelled()) && fromJson(needs.precondition.outputs.required).lint == 'true' - name: Linters, licenses, dependencies and documentation generation + name: Linters, licenses, and dependencies runs-on: ubuntu-latest timeout-minutes: 180 env: @@ -764,7 +771,90 @@ jobs: Rscript -e "devtools::install_version('lintr', version='2.0.1', repos='https://cloud.r-project.org')" - name: Install R linter dependencies and SparkR run: ./R/install-dev.sh - # Should delete this section after SPARK 3.5 EOL. 
+ - name: R linter + run: ./dev/lint-r + + # Documentation build + docs: + needs: [precondition, infra-image] + # always run if lint == 'true', even infra-image is skip (such as non-master job) + if: (!cancelled()) && fromJson(needs.precondition.outputs.required).docs == 'true' + name: Documentation generation + runs-on: ubuntu-latest + timeout-minutes: 180 + env: + LC_ALL: C.UTF-8 + LANG: C.UTF-8 + NOLINT_ON_COMPILE: false + PYSPARK_DRIVER_PYTHON: python3.9 + PYSPARK_PYTHON: python3.9 + GITHUB_PREV_SHA: ${{ github.event.before }} + container: + image: ${{ needs.precondition.outputs.image_url }} + steps: + - name: Checkout Spark repository + uses: actions/checkout@v4 + with: + fetch-depth: 0 + repository: apache/spark + ref: ${{ inputs.branch }} + - name: Add GITHUB_WORKSPACE to git trust safe.directory + run: | + git config --global --add safe.directory ${GITHUB_WORKSPACE} + - name: Sync the current branch with the latest in Apache Spark + if: github.repository != 'apache/spark' + run: | + echo "APACHE_SPARK_REF=$(git rev-parse HEAD)" >> $GITHUB_ENV + git fetch https://github.com/$GITHUB_REPOSITORY.git ${GITHUB_REF#refs/heads/} + git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' merge --no-commit --progress --squash FETCH_HEAD + git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' commit -m "Merged commit" --allow-empty + # Cache local repositories. Note that GitHub Actions cache has a 10G limit. + - name: Cache SBT and Maven + uses: actions/cache@v4 + with: + path: | + build/apache-maven-* + build/*.jar + ~/.sbt + key: build-${{ hashFiles('**/pom.xml', 'project/build.properties', 'build/mvn', 'build/sbt', 'build/sbt-launch-lib.bash', 'build/spark-build-info') }} + restore-keys: | + build- + - name: Cache Coursier local repository + uses: actions/cache@v4 + with: + path: ~/.cache/coursier + key: docs-coursier-${{ hashFiles('**/pom.xml', '**/plugins.sbt') }} + restore-keys: | + docs-coursier- + - name: Cache Maven local repository + uses: actions/cache@v4 + with: + path: ~/.m2/repository + key: docs-maven-${{ hashFiles('**/pom.xml') }} + restore-keys: | + docs-maven- + - name: Free up disk space + run: | + if [ -f ./dev/free_disk_space_container ]; then + ./dev/free_disk_space_container + fi + - name: Install Java ${{ inputs.java }} + uses: actions/setup-java@v4 + with: + distribution: zulu + java-version: ${{ inputs.java }} + - name: Install Python dependencies for python linter and documentation generation + if: inputs.branch != 'branch-3.4' && inputs.branch != 'branch-3.5' + run: | + # Should unpin 'sphinxcontrib-*' after upgrading sphinx>5 + # See 'ipython_genutils' in SPARK-38517 + # See 'docutils<0.18.0' in SPARK-39421 + python3.9 -m pip install 'sphinx==4.5.0' mkdocs 'pydata_sphinx_theme>=0.13' sphinx-copybutton nbsphinx numpydoc jinja2 markupsafe 'pyzmq<24.0.0' \ + ipython ipython_genutils sphinx_plotly_directive 'numpy>=1.20.0' pyarrow pandas 'plotly>=4.8' 'docutils<0.18.0' \ + 'flake8==3.9.0' 'mypy==1.8.0' 'pytest==7.1.3' 'pytest-mypy-plugins==1.9.3' 'black==23.9.1' \ + 'pandas-stubs==1.2.0.53' 'grpcio==1.62.0' 'grpc-stubs==1.24.11' 'googleapis-common-protos-stubs==2.2.0' \ + 'sphinxcontrib-applehelp==1.0.4' 'sphinxcontrib-devhelp==1.0.2' 'sphinxcontrib-htmlhelp==2.0.1' 'sphinxcontrib-qthelp==1.0.3' 'sphinxcontrib-serializinghtml==1.1.5' + python3.9 -m pip list - name: Install dependencies for documentation generation for branch-3.4, branch-3.5 if: inputs.branch == 'branch-3.4' || inputs.branch == 'branch-3.5' run: 
| @@ -785,8 +875,6 @@ jobs: gem install bundler -v 2.4.22 cd docs bundle install - - name: R linter - run: ./dev/lint-r - name: Run documentation build run: | # We need this link because the jekyll build calls `python`. diff --git a/.github/workflows/build_branch34.yml b/.github/workflows/build_branch34.yml index 68887970d4d81..deb6c42407970 100644 --- a/.github/workflows/build_branch34.yml +++ b/.github/workflows/build_branch34.yml @@ -43,7 +43,6 @@ jobs: jobs: >- { "build": "true", - "pyspark": "true", "sparkr": "true", "tpcds-1g": "true", "docker-integration-tests": "true", diff --git a/.github/workflows/build_branch34_python.yml b/.github/workflows/build_branch34_python.yml new file mode 100644 index 0000000000000..c109ba2dc7922 --- /dev/null +++ b/.github/workflows/build_branch34_python.yml @@ -0,0 +1,45 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# + +name: "Build / Python-only (branch-3.4)" + +on: + schedule: + - cron: '0 9 * * *' + +jobs: + run-build: + permissions: + packages: write + name: Run + uses: ./.github/workflows/build_and_test.yml + if: github.repository == 'apache/spark' + with: + java: 8 + branch: branch-3.4 + hadoop: hadoop3 + envs: >- + { + "PYTHON_TO_TEST": "" + } + jobs: >- + { + "pyspark": "true", + "pyspark-pandas": "true" + } diff --git a/.github/workflows/build_branch35.yml b/.github/workflows/build_branch35.yml index 55616c2f1f017..2ec080d5722c1 100644 --- a/.github/workflows/build_branch35.yml +++ b/.github/workflows/build_branch35.yml @@ -43,7 +43,6 @@ jobs: jobs: >- { "build": "true", - "pyspark": "true", "sparkr": "true", "tpcds-1g": "true", "docker-integration-tests": "true", diff --git a/.github/workflows/build_branch35_python.yml b/.github/workflows/build_branch35_python.yml new file mode 100644 index 0000000000000..1585534d33ba9 --- /dev/null +++ b/.github/workflows/build_branch35_python.yml @@ -0,0 +1,45 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+# + +name: "Build / Python-only (branch-3.5)" + +on: + schedule: + - cron: '0 11 * * *' + +jobs: + run-build: + permissions: + packages: write + name: Run + uses: ./.github/workflows/build_and_test.yml + if: github.repository == 'apache/spark' + with: + java: 8 + branch: branch-3.5 + hadoop: hadoop3 + envs: >- + { + "PYTHON_TO_TEST": "" + } + jobs: >- + { + "pyspark": "true", + "pyspark-pandas": "true" + } diff --git a/.github/workflows/build_non_ansi.yml b/.github/workflows/build_non_ansi.yml index 9026276903206..4ac2a589f4f81 100644 --- a/.github/workflows/build_non_ansi.yml +++ b/.github/workflows/build_non_ansi.yml @@ -17,7 +17,7 @@ # under the License. # -name: "Build / NON-ANSI (master, Hadoop 3, JDK 17, Scala 2.13)" +name: "Build / Non-ANSI (master, Hadoop 3, JDK 17, Scala 2.13)" on: schedule: @@ -41,6 +41,7 @@ jobs: jobs: >- { "build": "true", + "docs": "true", "pyspark": "true", "sparkr": "true", "tpcds-1g": "true", diff --git a/.github/workflows/build_python.yml b/.github/workflows/build_python_3.10.yml similarity index 63% rename from .github/workflows/build_python.yml rename to .github/workflows/build_python_3.10.yml index efa281d6a279c..5ae37fbc9120e 100644 --- a/.github/workflows/build_python.yml +++ b/.github/workflows/build_python_3.10.yml @@ -17,26 +17,14 @@ # under the License. # -# According to https://infra.apache.org/github-actions-policy.html, -# all workflows SHOULD have a job concurrency level less than or equal to 15. -# To do that, we run one python version per cron schedule -name: "Build / Python-only (master, PyPy 3.9/Python 3.10/Python 3.12)" +name: "Build / Python-only (master, Python 3.10)" on: schedule: - - cron: '0 15 * * *' - cron: '0 17 * * *' - - cron: '0 19 * * *' jobs: run-build: - strategy: - fail-fast: false - matrix: - include: - - pyversion: ${{ github.event.schedule == '0 15 * * *' && 'pypy3' }} - - pyversion: ${{ github.event.schedule == '0 17 * * *' && 'python3.10' }} - - pyversion: ${{ github.event.schedule == '0 19 * * *' && 'python3.12' }} permissions: packages: write name: Run @@ -48,7 +36,7 @@ jobs: hadoop: hadoop3 envs: >- { - "PYTHON_TO_TEST": "${{ matrix.pyversion }}" + "PYTHON_TO_TEST": "python3.10" } jobs: >- { diff --git a/.github/workflows/build_python_3.12.yml b/.github/workflows/build_python_3.12.yml new file mode 100644 index 0000000000000..e1fd45a7d8838 --- /dev/null +++ b/.github/workflows/build_python_3.12.yml @@ -0,0 +1,45 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+# + +name: "Build / Python-only (master, Python 3.12)" + +on: + schedule: + - cron: '0 19 * * *' + +jobs: + run-build: + permissions: + packages: write + name: Run + uses: ./.github/workflows/build_and_test.yml + if: github.repository == 'apache/spark' + with: + java: 17 + branch: master + hadoop: hadoop3 + envs: >- + { + "PYTHON_TO_TEST": "python3.12" + } + jobs: >- + { + "pyspark": "true", + "pyspark-pandas": "true" + } diff --git a/.github/workflows/build_python_pypy3.9.yml b/.github/workflows/build_python_pypy3.9.yml new file mode 100644 index 0000000000000..e05071ef034a0 --- /dev/null +++ b/.github/workflows/build_python_pypy3.9.yml @@ -0,0 +1,45 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# + +name: "Build / Python-only (master, PyPy 3.9)" + +on: + schedule: + - cron: '0 15 * * *' + +jobs: + run-build: + permissions: + packages: write + name: Run + uses: ./.github/workflows/build_and_test.yml + if: github.repository == 'apache/spark' + with: + java: 17 + branch: master + hadoop: hadoop3 + envs: >- + { + "PYTHON_TO_TEST": "pypy3" + } + jobs: >- + { + "pyspark": "true", + "pyspark-pandas": "true" + } diff --git a/.github/workflows/cancel_duplicate_workflow_runs.yml b/.github/workflows/cancel_duplicate_workflow_runs.yml index d41ca31190d94..697a3fb277d33 100644 --- a/.github/workflows/cancel_duplicate_workflow_runs.yml +++ b/.github/workflows/cancel_duplicate_workflow_runs.yml @@ -17,7 +17,7 @@ # under the License. # -name: Cancelling Duplicates +name: Cancelling duplicates on: workflow_run: workflows: diff --git a/.github/workflows/publish_snapshot.yml b/.github/workflows/publish_snapshot.yml index 006ccf239e6fe..1b5bd0ba61288 100644 --- a/.github/workflows/publish_snapshot.yml +++ b/.github/workflows/publish_snapshot.yml @@ -17,7 +17,7 @@ # under the License. # -name: Publish Snapshot +name: Publish snapshot on: schedule: diff --git a/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/Dataset.scala b/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/Dataset.scala index 9a42afebf8f2b..37f770319b695 100644 --- a/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/Dataset.scala +++ b/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/Dataset.scala @@ -3337,8 +3337,69 @@ class Dataset[T] private[sql] ( } } + /** + * Define (named) metrics to observe on the Dataset. This method returns an 'observed' Dataset + * that returns the same result as the input, with the following guarantees: <ul> <li>It will + * compute the defined aggregates (metrics) on all the data that is flowing through the Dataset + * at that point.</li> <li>It will report the value of the defined aggregate columns as soon as + * we reach a completion point. A completion point is either the end of a query (batch mode) or + * the end of a streaming epoch. The value of the aggregates only reflects the data processed + * since the previous completion point.</li> </ul> Please note that continuous execution is + * currently not supported. + * + * The metrics columns must either contain a literal (e.g. lit(42)), or should contain one or + * more aggregate functions (e.g. sum(a) or sum(a + b) + avg(c) - lit(1)).
Expressions that + * contain references to the input Dataset's columns must always be wrapped in an aggregate + * function. + * + * A user can retrieve the metrics by calling + * `org.apache.spark.sql.Dataset.collectResult().getObservedMetrics`. + * + * {{{ + * // Observe row count (rows) and highest id (maxid) in the Dataset while writing it + * val observed_ds = ds.observe("my_metrics", count(lit(1)).as("rows"), max($"id").as("maxid")) + * observed_ds.write.parquet("ds.parquet") + * val metrics = observed_ds.collectResult().getObservedMetrics + * }}} + * + * @group typedrel + * @since 4.0.0 + */ + @scala.annotation.varargs def observe(name: String, expr: Column, exprs: Column*): Dataset[T] = { - throw new UnsupportedOperationException("observe is not implemented.") + sparkSession.newDataset(agnosticEncoder) { builder => + builder.getCollectMetricsBuilder + .setInput(plan.getRoot) + .setName(name) + .addAllMetrics((expr +: exprs).map(_.expr).asJava) + } + } + + /** + * Observe (named) metrics through an `org.apache.spark.sql.Observation` instance. This is + * equivalent to calling `observe(String, Column, Column*)` but does not require collecting all + * results before returning the metrics - the metrics are filled while the results are iterated, + * as soon as they are available. This method does not support streaming datasets. + * + * A user can retrieve the metrics by accessing `org.apache.spark.sql.Observation.get`. + * + * {{{ + * // Observe row count (rows) and highest id (maxid) in the Dataset while writing it + * val observation = Observation("my_metrics") + * val observed_ds = ds.observe(observation, count(lit(1)).as("rows"), max($"id").as("maxid")) + * observed_ds.write.parquet("ds.parquet") + * val metrics = observation.get + * }}} + * + * @throws IllegalArgumentException + * If this is a streaming Dataset (this.isStreaming == true) + * + * @group typedrel + * @since 4.0.0 + */ + @scala.annotation.varargs + def observe(observation: Observation, expr: Column, exprs: Column*): Dataset[T] = { + val df = observe(observation.name, expr, exprs: _*) + sparkSession.registerObservation(df.getPlanId.get, observation) + df } def checkpoint(): Dataset[T] = { diff --git a/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/Observation.scala b/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/Observation.scala new file mode 100644 index 0000000000000..75629b6000f91 --- /dev/null +++ b/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/Observation.scala @@ -0,0 +1,46 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License.
+ */ + +package org.apache.spark.sql + +import java.util.UUID + +class Observation(name: String) extends ObservationBase(name) { + + /** + * Create an Observation instance without providing a name. This generates a random name. + */ + def this() = this(UUID.randomUUID().toString) +} + +/** + * (Scala-specific) Create instances of Observation via Scala `apply`. + * @since 4.0.0 + */ +object Observation { + + /** + * Observation constructor for creating an anonymous observation. + */ + def apply(): Observation = new Observation() + + /** + * Observation constructor for creating a named observation. + */ + def apply(name: String): Observation = new Observation(name) + +} diff --git a/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/SparkSession.scala b/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/SparkSession.scala index 22bb62803fac5..1188fba60a2fe 100644 --- a/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/SparkSession.scala +++ b/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/SparkSession.scala @@ -18,6 +18,7 @@ package org.apache.spark.sql import java.io.Closeable import java.net.URI +import java.util.concurrent.ConcurrentHashMap import java.util.concurrent.TimeUnit._ import java.util.concurrent.atomic.{AtomicLong, AtomicReference} @@ -36,7 +37,7 @@ import org.apache.spark.sql.catalog.Catalog import org.apache.spark.sql.catalyst.{JavaTypeInference, ScalaReflection} import org.apache.spark.sql.catalyst.encoders.{AgnosticEncoder, RowEncoder} import org.apache.spark.sql.catalyst.encoders.AgnosticEncoders.{BoxedLongEncoder, UnboundRowEncoder} -import org.apache.spark.sql.connect.client.{ClassFinder, SparkConnectClient, SparkResult} +import org.apache.spark.sql.connect.client.{ClassFinder, CloseableIterator, SparkConnectClient, SparkResult} import org.apache.spark.sql.connect.client.SparkConnectClient.Configuration import org.apache.spark.sql.connect.client.arrow.ArrowSerializer import org.apache.spark.sql.functions.lit @@ -80,6 +81,8 @@ class SparkSession private[sql] ( client.analyze(proto.AnalyzePlanRequest.AnalyzeCase.SPARK_VERSION).getSparkVersion.getVersion } + private[sql] val observationRegistry = new ConcurrentHashMap[Long, Observation]() + /** * Runtime configuration interface for Spark. 
* @@ -532,8 +535,12 @@ class SparkSession private[sql] ( private[sql] def execute[T](plan: proto.Plan, encoder: AgnosticEncoder[T]): SparkResult[T] = { val value = client.execute(plan) - val result = new SparkResult(value, allocator, encoder, timeZoneId) - result + new SparkResult( + value, + allocator, + encoder, + timeZoneId, + Some(setMetricsAndUnregisterObservation)) } private[sql] def execute(f: proto.Relation.Builder => Unit): Unit = { @@ -554,6 +561,9 @@ class SparkSession private[sql] ( client.execute(plan).filter(!_.hasExecutionProgress).toSeq } + private[sql] def execute(plan: proto.Plan): CloseableIterator[ExecutePlanResponse] = + client.execute(plan) + private[sql] def registerUdf(udf: proto.CommonInlineUserDefinedFunction): Unit = { val command = proto.Command.newBuilder().setRegisterFunction(udf).build() execute(command) @@ -779,6 +789,21 @@ class SparkSession private[sql] ( * Set to false to prevent client.releaseSession on close() (testing only) */ private[sql] var releaseSessionOnClose = true + + private[sql] def registerObservation(planId: Long, observation: Observation): Unit = { + if (observationRegistry.putIfAbsent(planId, observation) != null) { + throw new IllegalArgumentException("An Observation can be used with a Dataset only once") + } + } + + private[sql] def setMetricsAndUnregisterObservation( + planId: Long, + metrics: Map[String, Any]): Unit = { + val observationOrNull = observationRegistry.remove(planId) + if (observationOrNull != null) { + observationOrNull.setMetricsAndNotify(Some(metrics)) + } + } } // The minimal builder needed to create a spark session. diff --git a/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/ClientE2ETestSuite.scala b/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/ClientE2ETestSuite.scala index a0729adb89609..73a2f6d4f88e1 100644 --- a/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/ClientE2ETestSuite.scala +++ b/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/ClientE2ETestSuite.scala @@ -22,6 +22,8 @@ import java.time.DateTimeException import java.util.Properties import scala.collection.mutable +import scala.concurrent.{ExecutionContext, Future} +import scala.concurrent.duration.DurationInt import scala.jdk.CollectionConverters._ import org.apache.commons.io.FileUtils @@ -41,6 +43,7 @@ import org.apache.spark.sql.internal.SqlApiConf import org.apache.spark.sql.test.{IntegrationTestUtils, RemoteSparkSession, SQLHelper} import org.apache.spark.sql.test.SparkConnectServerUtils.port import org.apache.spark.sql.types._ +import org.apache.spark.util.SparkThreadUtils class ClientE2ETestSuite extends RemoteSparkSession with SQLHelper with PrivateMethodTester { @@ -1511,6 +1514,46 @@ class ClientE2ETestSuite extends RemoteSparkSession with SQLHelper with PrivateM (0 until 5).foreach(i => assert(row.get(i * 2) === row.get(i * 2 + 1))) } } + + test("Observable metrics") { + val df = spark.range(99).withColumn("extra", col("id") - 1) + val ob1 = new Observation("ob1") + val observedDf = df.observe(ob1, min("id"), avg("id"), max("id")) + val observedObservedDf = observedDf.observe("ob2", min("extra"), avg("extra"), max("extra")) + + val ob1Schema = new StructType() + .add("min(id)", LongType) + .add("avg(id)", DoubleType) + .add("max(id)", LongType) + val ob2Schema = new StructType() + .add("min(extra)", LongType) + .add("avg(extra)", DoubleType) + .add("max(extra)", LongType) + val ob1Metrics = Map("ob1" -> new GenericRowWithSchema(Array(0, 49, 98), ob1Schema)) + val 
ob2Metrics = Map("ob2" -> new GenericRowWithSchema(Array(-1, 48, 97), ob2Schema)) + + assert(df.collectResult().getObservedMetrics === Map.empty) + assert(observedDf.collectResult().getObservedMetrics === ob1Metrics) + assert(observedObservedDf.collectResult().getObservedMetrics === ob1Metrics ++ ob2Metrics) + } + + test("Observation.get is blocked until the query is finished") { + val df = spark.range(99).withColumn("extra", col("id") - 1) + val observation = new Observation("ob1") + val observedDf = df.observe(observation, min("id"), avg("id"), max("id")) + + // Start a new thread to get the observation + val future = Future(observation.get)(ExecutionContext.global) + // make sure the thread is blocked right now + val e = intercept[java.util.concurrent.TimeoutException] { + SparkThreadUtils.awaitResult(future, 2.seconds) + } + assert(e.getMessage.contains("Future timed out")) + observedDf.collect() + // make sure the thread is unblocked after the query is finished + val metrics = SparkThreadUtils.awaitResult(future, 2.seconds) + assert(metrics === Map("min(id)" -> 0, "avg(id)" -> 49, "max(id)" -> 98)) + } } private[sql] case class ClassData(a: String, b: Int) diff --git a/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/connect/client/CheckConnectJvmClientCompatibility.scala b/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/connect/client/CheckConnectJvmClientCompatibility.scala index c89dba03ed699..7be5e2ecd1725 100644 --- a/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/connect/client/CheckConnectJvmClientCompatibility.scala +++ b/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/connect/client/CheckConnectJvmClientCompatibility.scala @@ -196,9 +196,6 @@ object CheckConnectJvmClientCompatibility { ProblemFilters.exclude[Problem]("org.apache.spark.sql.Dataset.COL_POS_KEY"), ProblemFilters.exclude[Problem]("org.apache.spark.sql.Dataset.DATASET_ID_KEY"), ProblemFilters.exclude[Problem]("org.apache.spark.sql.Dataset.curId"), - ProblemFilters.exclude[Problem]("org.apache.spark.sql.Dataset.observe"), - ProblemFilters.exclude[MissingClassProblem]("org.apache.spark.sql.Observation"), - ProblemFilters.exclude[MissingClassProblem]("org.apache.spark.sql.Observation$"), ProblemFilters.exclude[MissingClassProblem]("org.apache.spark.sql.ObservationListener"), ProblemFilters.exclude[MissingClassProblem]("org.apache.spark.sql.ObservationListener$"), ProblemFilters.exclude[Problem]("org.apache.spark.sql.Dataset.queryExecution"), diff --git a/connector/connect/common/src/main/protobuf/spark/connect/base.proto b/connector/connect/common/src/main/protobuf/spark/connect/base.proto index 49a33d3419b6f..77dda277602ab 100644 --- a/connector/connect/common/src/main/protobuf/spark/connect/base.proto +++ b/connector/connect/common/src/main/protobuf/spark/connect/base.proto @@ -434,6 +434,7 @@ message ExecutePlanResponse { string name = 1; repeated Expression.Literal values = 2; repeated string keys = 3; + int64 plan_id = 4; } message ResultComplete { diff --git a/connector/connect/common/src/main/scala/org/apache/spark/sql/connect/client/SparkResult.scala b/connector/connect/common/src/main/scala/org/apache/spark/sql/connect/client/SparkResult.scala index 93d1075aea025..0905ee76c3f34 100644 --- a/connector/connect/common/src/main/scala/org/apache/spark/sql/connect/client/SparkResult.scala +++ b/connector/connect/common/src/main/scala/org/apache/spark/sql/connect/client/SparkResult.scala @@ -27,10 +27,13 @@ import 
org.apache.arrow.vector.ipc.message.{ArrowMessage, ArrowRecordBatch} import org.apache.arrow.vector.types.pojo import org.apache.spark.connect.proto +import org.apache.spark.connect.proto.ExecutePlanResponse.ObservedMetrics +import org.apache.spark.sql.Row import org.apache.spark.sql.catalyst.encoders.{AgnosticEncoder, RowEncoder} import org.apache.spark.sql.catalyst.encoders.AgnosticEncoders.{ProductEncoder, UnboundRowEncoder} +import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema import org.apache.spark.sql.connect.client.arrow.{AbstractMessageIterator, ArrowDeserializingIterator, ConcatenatingArrowStreamReader, MessageIterator} -import org.apache.spark.sql.connect.common.DataTypeProtoConverter +import org.apache.spark.sql.connect.common.{DataTypeProtoConverter, LiteralValueProtoConverter} import org.apache.spark.sql.types.{DataType, StructType} import org.apache.spark.sql.util.ArrowUtils @@ -38,7 +41,8 @@ private[sql] class SparkResult[T]( responses: CloseableIterator[proto.ExecutePlanResponse], allocator: BufferAllocator, encoder: AgnosticEncoder[T], - timeZoneId: String) + timeZoneId: String, + setObservationMetricsOpt: Option[(Long, Map[String, Any]) => Unit] = None) extends AutoCloseable { self => case class StageInfo( @@ -79,6 +83,7 @@ private[sql] class SparkResult[T]( private[this] var arrowSchema: pojo.Schema = _ private[this] var nextResultIndex: Int = 0 private val resultMap = mutable.Map.empty[Int, (Long, Seq[ArrowMessage])] + private val observedMetrics = mutable.Map.empty[String, Row] private val cleanable = SparkResult.cleaner.register(this, new SparkResultCloseable(resultMap, responses)) @@ -117,6 +122,9 @@ private[sql] class SparkResult[T]( while (!stop && responses.hasNext) { val response = responses.next() + // Collect metrics for this response + observedMetrics ++= processObservedMetrics(response.getObservedMetricsList) + // Save and validate operationId if (opId == null) { opId = response.getOperationId @@ -198,6 +206,29 @@ private[sql] class SparkResult[T]( nonEmpty } + private def processObservedMetrics( + metrics: java.util.List[ObservedMetrics]): Iterable[(String, Row)] = { + metrics.asScala.map { metric => + assert(metric.getKeysCount == metric.getValuesCount) + var schema = new StructType() + val keys = mutable.ListBuffer.empty[String] + val values = mutable.ListBuffer.empty[Any] + (0 until metric.getKeysCount).map { i => + val key = metric.getKeys(i) + val value = LiteralValueProtoConverter.toCatalystValue(metric.getValues(i)) + schema = schema.add(key, LiteralValueProtoConverter.toDataType(value.getClass)) + keys += key + values += value + } + // If the metrics is registered by an Observation object, attach them and unblock any + // blocked thread. + setObservationMetricsOpt.foreach { setObservationMetrics => + setObservationMetrics(metric.getPlanId, keys.zip(values).toMap) + } + metric.getName -> new GenericRowWithSchema(values.toArray, schema) + } + } + /** * Returns the number of elements in the result. */ @@ -248,6 +279,15 @@ private[sql] class SparkResult[T]( result } + /** + * Returns all observed metrics in the result. + */ + def getObservedMetrics: Map[String, Row] = { + // We need to process all responses to get all metrics. + processResponses() + observedMetrics.toMap + } + /** * Returns an iterator over the contents of the result. 
*/ diff --git a/connector/connect/common/src/main/scala/org/apache/spark/sql/connect/common/LiteralValueProtoConverter.scala b/connector/connect/common/src/main/scala/org/apache/spark/sql/connect/common/LiteralValueProtoConverter.scala index ce42cc797bf38..1f3496fa89847 100644 --- a/connector/connect/common/src/main/scala/org/apache/spark/sql/connect/common/LiteralValueProtoConverter.scala +++ b/connector/connect/common/src/main/scala/org/apache/spark/sql/connect/common/LiteralValueProtoConverter.scala @@ -204,7 +204,7 @@ object LiteralValueProtoConverter { def toLiteralProto(literal: Any, dataType: DataType): proto.Expression.Literal = toLiteralProtoBuilder(literal, dataType).build() - private def toDataType(clz: Class[_]): DataType = clz match { + private[sql] def toDataType(clz: Class[_]): DataType = clz match { // primitive types case JShort.TYPE => ShortType case JInteger.TYPE => IntegerType diff --git a/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/execution/ExecuteThreadRunner.scala b/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/execution/ExecuteThreadRunner.scala index 0a6d12cbb1918..4ef4f632204b3 100644 --- a/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/execution/ExecuteThreadRunner.scala +++ b/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/execution/ExecuteThreadRunner.scala @@ -220,6 +220,7 @@ private[connect] class ExecuteThreadRunner(executeHolder: ExecuteHolder) extends .createObservedMetricsResponse( executeHolder.sessionHolder.sessionId, executeHolder.sessionHolder.serverSessionId, + executeHolder.request.getPlan.getRoot.getCommon.getPlanId, observedMetrics ++ accumulatedInPython)) } diff --git a/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/execution/SparkConnectPlanExecution.scala b/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/execution/SparkConnectPlanExecution.scala index 4f2b8c945127b..660951f229849 100644 --- a/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/execution/SparkConnectPlanExecution.scala +++ b/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/execution/SparkConnectPlanExecution.scala @@ -264,8 +264,14 @@ private[execution] class SparkConnectPlanExecution(executeHolder: ExecuteHolder) name -> values } if (observedMetrics.nonEmpty) { - Some(SparkConnectPlanExecution - .createObservedMetricsResponse(sessionId, sessionHolder.serverSessionId, observedMetrics)) + val planId = executeHolder.request.getPlan.getRoot.getCommon.getPlanId + Some( + SparkConnectPlanExecution + .createObservedMetricsResponse( + sessionId, + sessionHolder.serverSessionId, + planId, + observedMetrics)) } else None } } @@ -274,11 +280,13 @@ object SparkConnectPlanExecution { def createObservedMetricsResponse( sessionId: String, serverSessionId: String, + planId: Long, metrics: Map[String, Seq[(Option[String], Any)]]): ExecutePlanResponse = { val observedMetrics = metrics.map { case (name, values) => val metrics = ExecutePlanResponse.ObservedMetrics .newBuilder() .setName(name) + .setPlanId(planId) values.foreach { case (key, value) => metrics.addValues(toLiteralProto(value)) key.foreach(metrics.addKeys) diff --git a/connector/connect/server/src/test/scala/org/apache/spark/sql/connect/planner/SparkConnectServiceSuite.scala b/connector/connect/server/src/test/scala/org/apache/spark/sql/connect/planner/SparkConnectServiceSuite.scala index 59d9750c0fbf4..af18fca9dd216 100644 --- 
a/connector/connect/server/src/test/scala/org/apache/spark/sql/connect/planner/SparkConnectServiceSuite.scala +++ b/connector/connect/server/src/test/scala/org/apache/spark/sql/connect/planner/SparkConnectServiceSuite.scala @@ -418,14 +418,11 @@ class SparkConnectServiceSuite .setInput( proto.Relation.newBuilder().setSql(proto.SQL.newBuilder().setQuery("select 1")))), None), - // TODO(SPARK-48164) Reenable `commands send events - get_resources_command` - /* ( proto.Command .newBuilder() .setGetResourcesCommand(proto.GetResourcesCommand.newBuilder()), None), - */ ( proto.Command .newBuilder() diff --git a/core/src/main/scala/org/apache/spark/internal/config/ConfigEntry.scala b/core/src/main/scala/org/apache/spark/internal/config/ConfigEntry.scala index c07f2528ee709..a295ef06a6376 100644 --- a/core/src/main/scala/org/apache/spark/internal/config/ConfigEntry.scala +++ b/core/src/main/scala/org/apache/spark/internal/config/ConfigEntry.scala @@ -227,7 +227,7 @@ private[spark] class OptionalConfigEntry[T]( prependSeparator, alternatives, s => Some(rawValueConverter(s)), - v => v.map(rawStringConverter).getOrElse(ConfigEntry.UNDEFINED), + v => v.map(rawStringConverter).orNull, doc, isPublic, version diff --git a/dev/create-release/release-build.sh b/dev/create-release/release-build.sh index 75ec98464f3ec..b720a8fc93861 100755 --- a/dev/create-release/release-build.sh +++ b/dev/create-release/release-build.sh @@ -194,6 +194,8 @@ fi PUBLISH_SCALA_2_12=1 if [[ $SPARK_VERSION > "3.5.99" ]]; then PUBLISH_SCALA_2_12=0 + # There is no longer scala-2.13 profile since 4.0.0 + SCALA_2_13_PROFILES="" fi SCALA_2_12_PROFILES="-Pscala-2.12" @@ -345,21 +347,25 @@ if [[ "$1" == "package" ]]; then declare -A BINARY_PKGS_EXTRA BINARY_PKGS_EXTRA["hadoop3"]="withpip,withr" - if [[ $PUBLISH_SCALA_2_13 = 1 ]]; then - key="hadoop3-scala2.13" + # This is dead code as Scala 2.12 is no longer supported, but we keep it as a template for + # adding new Scala version support in the future. This secondary Scala version only has one + # binary package to avoid doubling the number of final packages. It doesn't build PySpark and + # SparkR as the primary Scala version will build them. + if [[ $PUBLISH_SCALA_2_12 = 1 ]]; then + key="hadoop3-scala2.12" args="-Phadoop-3 $HIVE_PROFILES" extra="" - if ! make_binary_release "$key" "$SCALA_2_13_PROFILES $args" "$extra" "2.13"; then + if ! make_binary_release "$key" "$SCALA_2_12_PROFILES $args" "$extra" "2.12"; then error "Failed to build $key package. Check logs for details." fi fi - if [[ $PUBLISH_SCALA_2_12 = 1 ]]; then + if [[ $PUBLISH_SCALA_2_13 = 1 ]]; then echo "Packages to build: ${!BINARY_PKGS_ARGS[@]}" for key in ${!BINARY_PKGS_ARGS[@]}; do args=${BINARY_PKGS_ARGS[$key]} extra=${BINARY_PKGS_EXTRA[$key]} - if ! make_binary_release "$key" "$SCALA_2_12_PROFILES $args" "$extra" "2.12"; then + if ! make_binary_release "$key" "$SCALA_2_13_PROFILES $args" "$extra" "2.13"; then error "Failed to build $key package. Check logs for details." 
fi done diff --git a/dev/deps/spark-deps-hadoop-3-hive-2.3 b/dev/deps/spark-deps-hadoop-3-hive-2.3 index 5d933e34e40ba..73d41e9eeb337 100644 --- a/dev/deps/spark-deps-hadoop-3-hive-2.3 +++ b/dev/deps/spark-deps-hadoop-3-hive-2.3 @@ -104,15 +104,15 @@ icu4j/72.1//icu4j-72.1.jar ini4j/0.5.4//ini4j-0.5.4.jar istack-commons-runtime/3.0.8//istack-commons-runtime-3.0.8.jar ivy/2.5.2//ivy-2.5.2.jar -jackson-annotations/2.17.0//jackson-annotations-2.17.0.jar +jackson-annotations/2.17.1//jackson-annotations-2.17.1.jar jackson-core-asl/1.9.13//jackson-core-asl-1.9.13.jar -jackson-core/2.17.0//jackson-core-2.17.0.jar -jackson-databind/2.17.0//jackson-databind-2.17.0.jar -jackson-dataformat-cbor/2.17.0//jackson-dataformat-cbor-2.17.0.jar -jackson-dataformat-yaml/2.17.0//jackson-dataformat-yaml-2.17.0.jar -jackson-datatype-jsr310/2.17.0//jackson-datatype-jsr310-2.17.0.jar +jackson-core/2.17.1//jackson-core-2.17.1.jar +jackson-databind/2.17.1//jackson-databind-2.17.1.jar +jackson-dataformat-cbor/2.17.1//jackson-dataformat-cbor-2.17.1.jar +jackson-dataformat-yaml/2.17.1//jackson-dataformat-yaml-2.17.1.jar +jackson-datatype-jsr310/2.17.1//jackson-datatype-jsr310-2.17.1.jar jackson-mapper-asl/1.9.13//jackson-mapper-asl-1.9.13.jar -jackson-module-scala_2.13/2.17.0//jackson-module-scala_2.13-2.17.0.jar +jackson-module-scala_2.13/2.17.1//jackson-module-scala_2.13-2.17.1.jar jakarta.annotation-api/2.0.0//jakarta.annotation-api-2.0.0.jar jakarta.inject-api/2.0.1//jakarta.inject-api-2.0.1.jar jakarta.servlet-api/5.0.0//jakarta.servlet-api-5.0.0.jar diff --git a/docs/sql-migration-guide.md b/docs/sql-migration-guide.md index fa49d6402b180..bd6604cb69c0f 100644 --- a/docs/sql-migration-guide.md +++ b/docs/sql-migration-guide.md @@ -32,7 +32,7 @@ license: | - Since Spark 4.0, `spark.sql.hive.metastore` drops the support of Hive prior to 2.0.0 as they require JDK 8 that Spark does not support anymore. Users should migrate to higher versions. - Since Spark 4.0, `spark.sql.parquet.compression.codec` drops the support of codec name `lz4raw`, please use `lz4_raw` instead. - Since Spark 4.0, when overflowing during casting timestamp to byte/short/int under non-ansi mode, Spark will return null instead a wrapping value. -- Since Spark 4.0, the `encode()` and `decode()` functions support only the following charsets 'US-ASCII', 'ISO-8859-1', 'UTF-8', 'UTF-16BE', 'UTF-16LE', 'UTF-16'. To restore the previous behavior when the function accepts charsets of the current JDK used by Spark, set `spark.sql.legacy.javaCharsets` to `true`. +- Since Spark 4.0, the `encode()` and `decode()` functions support only the following charsets 'US-ASCII', 'ISO-8859-1', 'UTF-8', 'UTF-16BE', 'UTF-16LE', 'UTF-16', 'UTF-32'. To restore the previous behavior when the function accepts charsets of the current JDK used by Spark, set `spark.sql.legacy.javaCharsets` to `true`. - Since Spark 4.0, the legacy datetime rebasing SQL configs with the prefix `spark.sql.legacy` are removed. 
To restore the previous behavior, use the following configs: - `spark.sql.parquet.int96RebaseModeInWrite` instead of `spark.sql.legacy.parquet.int96RebaseModeInWrite` - `spark.sql.parquet.datetimeRebaseModeInWrite` instead of `spark.sql.legacy.parquet.datetimeRebaseModeInWrite` diff --git a/pom.xml b/pom.xml index f6f11d94cce32..c3ff5d101c224 100644 --- a/pom.xml +++ b/pom.xml @@ -183,8 +183,8 @@ true true 1.9.13 - 2.17.0 - 2.17.0 + 2.17.1 + 2.17.1 2.3.1 3.0.2 1.1.10.5 @@ -3384,6 +3384,9 @@ org.apache.maven.plugins maven-deploy-plugin 3.1.2 + + 3 + org.apache.maven.plugins diff --git a/python/pyspark/pandas/internal.py b/python/pyspark/pandas/internal.py index 767ec9a57f9b5..8ab8d79d56868 100644 --- a/python/pyspark/pandas/internal.py +++ b/python/pyspark/pandas/internal.py @@ -33,6 +33,7 @@ Window, ) from pyspark.sql.types import ( # noqa: F401 + _drop_metadata, BooleanType, DataType, LongType, @@ -761,14 +762,8 @@ def __init__( # in a few tests when using Spark Connect. However, the function works properly. # Therefore, we temporarily perform Spark Connect tests by excluding metadata # until the issue is resolved. - def remove_metadata(struct_field: StructField) -> StructField: - new_struct_field = StructField( - struct_field.name, struct_field.dataType, struct_field.nullable - ) - return new_struct_field - assert all( - remove_metadata(index_field.struct_field) == remove_metadata(struct_field) + _drop_metadata(index_field.struct_field) == _drop_metadata(struct_field) for index_field, struct_field in zip(index_fields, struct_fields) ), (index_fields, struct_fields) else: @@ -795,14 +790,8 @@ def remove_metadata(struct_field: StructField) -> StructField: # in a few tests when using Spark Connect. However, the function works properly. # Therefore, we temporarily perform Spark Connect tests by excluding metadata # until the issue is resolved. 
- def remove_metadata(struct_field: StructField) -> StructField: - new_struct_field = StructField( - struct_field.name, struct_field.dataType, struct_field.nullable - ) - return new_struct_field - assert all( - remove_metadata(data_field.struct_field) == remove_metadata(struct_field) + _drop_metadata(data_field.struct_field) == _drop_metadata(struct_field) for data_field, struct_field in zip(data_fields, struct_fields) ), (data_fields, struct_fields) else: diff --git a/python/pyspark/sql/connect/dataframe.py b/python/pyspark/sql/connect/dataframe.py index f9a209d2bcb3d..843c92a9b27d2 100644 --- a/python/pyspark/sql/connect/dataframe.py +++ b/python/pyspark/sql/connect/dataframe.py @@ -813,7 +813,7 @@ def sample( if withReplacement is None: withReplacement = False - seed = int(seed) if seed is not None else None + seed = int(seed) if seed is not None else random.randint(0, sys.maxsize) return DataFrame( plan.Sample( diff --git a/python/pyspark/sql/connect/proto/base_pb2.py b/python/pyspark/sql/connect/proto/base_pb2.py index 2a30ffe60a9f2..a39396db4ff1d 100644 --- a/python/pyspark/sql/connect/proto/base_pb2.py +++ b/python/pyspark/sql/connect/proto/base_pb2.py @@ -37,7 +37,7 @@ DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile( - b'\n\x18spark/connect/base.proto\x12\rspark.connect\x1a\x19google/protobuf/any.proto\x1a\x1cspark/connect/commands.proto\x1a\x1aspark/connect/common.proto\x1a\x1fspark/connect/expressions.proto\x1a\x1dspark/connect/relations.proto\x1a\x19spark/connect/types.proto"t\n\x04Plan\x12-\n\x04root\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationH\x00R\x04root\x12\x32\n\x07\x63ommand\x18\x02 \x01(\x0b\x32\x16.spark.connect.CommandH\x00R\x07\x63ommandB\t\n\x07op_type"z\n\x0bUserContext\x12\x17\n\x07user_id\x18\x01 \x01(\tR\x06userId\x12\x1b\n\tuser_name\x18\x02 \x01(\tR\x08userName\x12\x35\n\nextensions\x18\xe7\x07 \x03(\x0b\x32\x14.google.protobuf.AnyR\nextensions"\xf8\x13\n\x12\x41nalyzePlanRequest\x12\x1d\n\nsession_id\x18\x01 \x01(\tR\tsessionId\x12V\n&client_observed_server_side_session_id\x18\x11 \x01(\tH\x01R!clientObservedServerSideSessionId\x88\x01\x01\x12=\n\x0cuser_context\x18\x02 \x01(\x0b\x32\x1a.spark.connect.UserContextR\x0buserContext\x12$\n\x0b\x63lient_type\x18\x03 \x01(\tH\x02R\nclientType\x88\x01\x01\x12\x42\n\x06schema\x18\x04 \x01(\x0b\x32(.spark.connect.AnalyzePlanRequest.SchemaH\x00R\x06schema\x12\x45\n\x07\x65xplain\x18\x05 \x01(\x0b\x32).spark.connect.AnalyzePlanRequest.ExplainH\x00R\x07\x65xplain\x12O\n\x0btree_string\x18\x06 \x01(\x0b\x32,.spark.connect.AnalyzePlanRequest.TreeStringH\x00R\ntreeString\x12\x46\n\x08is_local\x18\x07 \x01(\x0b\x32).spark.connect.AnalyzePlanRequest.IsLocalH\x00R\x07isLocal\x12R\n\x0cis_streaming\x18\x08 \x01(\x0b\x32-.spark.connect.AnalyzePlanRequest.IsStreamingH\x00R\x0bisStreaming\x12O\n\x0binput_files\x18\t \x01(\x0b\x32,.spark.connect.AnalyzePlanRequest.InputFilesH\x00R\ninputFiles\x12U\n\rspark_version\x18\n \x01(\x0b\x32..spark.connect.AnalyzePlanRequest.SparkVersionH\x00R\x0csparkVersion\x12I\n\tddl_parse\x18\x0b \x01(\x0b\x32*.spark.connect.AnalyzePlanRequest.DDLParseH\x00R\x08\x64\x64lParse\x12X\n\x0esame_semantics\x18\x0c \x01(\x0b\x32/.spark.connect.AnalyzePlanRequest.SameSemanticsH\x00R\rsameSemantics\x12U\n\rsemantic_hash\x18\r \x01(\x0b\x32..spark.connect.AnalyzePlanRequest.SemanticHashH\x00R\x0csemanticHash\x12\x45\n\x07persist\x18\x0e \x01(\x0b\x32).spark.connect.AnalyzePlanRequest.PersistH\x00R\x07persist\x12K\n\tunpersist\x18\x0f 
\x01(\x0b\x32+.spark.connect.AnalyzePlanRequest.UnpersistH\x00R\tunpersist\x12_\n\x11get_storage_level\x18\x10 \x01(\x0b\x32\x31.spark.connect.AnalyzePlanRequest.GetStorageLevelH\x00R\x0fgetStorageLevel\x1a\x31\n\x06Schema\x12\'\n\x04plan\x18\x01 \x01(\x0b\x32\x13.spark.connect.PlanR\x04plan\x1a\xbb\x02\n\x07\x45xplain\x12\'\n\x04plan\x18\x01 \x01(\x0b\x32\x13.spark.connect.PlanR\x04plan\x12X\n\x0c\x65xplain_mode\x18\x02 \x01(\x0e\x32\x35.spark.connect.AnalyzePlanRequest.Explain.ExplainModeR\x0b\x65xplainMode"\xac\x01\n\x0b\x45xplainMode\x12\x1c\n\x18\x45XPLAIN_MODE_UNSPECIFIED\x10\x00\x12\x17\n\x13\x45XPLAIN_MODE_SIMPLE\x10\x01\x12\x19\n\x15\x45XPLAIN_MODE_EXTENDED\x10\x02\x12\x18\n\x14\x45XPLAIN_MODE_CODEGEN\x10\x03\x12\x15\n\x11\x45XPLAIN_MODE_COST\x10\x04\x12\x1a\n\x16\x45XPLAIN_MODE_FORMATTED\x10\x05\x1aZ\n\nTreeString\x12\'\n\x04plan\x18\x01 \x01(\x0b\x32\x13.spark.connect.PlanR\x04plan\x12\x19\n\x05level\x18\x02 \x01(\x05H\x00R\x05level\x88\x01\x01\x42\x08\n\x06_level\x1a\x32\n\x07IsLocal\x12\'\n\x04plan\x18\x01 \x01(\x0b\x32\x13.spark.connect.PlanR\x04plan\x1a\x36\n\x0bIsStreaming\x12\'\n\x04plan\x18\x01 \x01(\x0b\x32\x13.spark.connect.PlanR\x04plan\x1a\x35\n\nInputFiles\x12\'\n\x04plan\x18\x01 \x01(\x0b\x32\x13.spark.connect.PlanR\x04plan\x1a\x0e\n\x0cSparkVersion\x1a)\n\x08\x44\x44LParse\x12\x1d\n\nddl_string\x18\x01 \x01(\tR\tddlString\x1ay\n\rSameSemantics\x12\x34\n\x0btarget_plan\x18\x01 \x01(\x0b\x32\x13.spark.connect.PlanR\ntargetPlan\x12\x32\n\nother_plan\x18\x02 \x01(\x0b\x32\x13.spark.connect.PlanR\totherPlan\x1a\x37\n\x0cSemanticHash\x12\'\n\x04plan\x18\x01 \x01(\x0b\x32\x13.spark.connect.PlanR\x04plan\x1a\x97\x01\n\x07Persist\x12\x33\n\x08relation\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x08relation\x12\x45\n\rstorage_level\x18\x02 \x01(\x0b\x32\x1b.spark.connect.StorageLevelH\x00R\x0cstorageLevel\x88\x01\x01\x42\x10\n\x0e_storage_level\x1an\n\tUnpersist\x12\x33\n\x08relation\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x08relation\x12\x1f\n\x08\x62locking\x18\x02 \x01(\x08H\x00R\x08\x62locking\x88\x01\x01\x42\x0b\n\t_blocking\x1a\x46\n\x0fGetStorageLevel\x12\x33\n\x08relation\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x08relationB\t\n\x07\x61nalyzeB)\n\'_client_observed_server_side_session_idB\x0e\n\x0c_client_type"\xce\r\n\x13\x41nalyzePlanResponse\x12\x1d\n\nsession_id\x18\x01 \x01(\tR\tsessionId\x12\x33\n\x16server_side_session_id\x18\x0f \x01(\tR\x13serverSideSessionId\x12\x43\n\x06schema\x18\x02 \x01(\x0b\x32).spark.connect.AnalyzePlanResponse.SchemaH\x00R\x06schema\x12\x46\n\x07\x65xplain\x18\x03 \x01(\x0b\x32*.spark.connect.AnalyzePlanResponse.ExplainH\x00R\x07\x65xplain\x12P\n\x0btree_string\x18\x04 \x01(\x0b\x32-.spark.connect.AnalyzePlanResponse.TreeStringH\x00R\ntreeString\x12G\n\x08is_local\x18\x05 \x01(\x0b\x32*.spark.connect.AnalyzePlanResponse.IsLocalH\x00R\x07isLocal\x12S\n\x0cis_streaming\x18\x06 \x01(\x0b\x32..spark.connect.AnalyzePlanResponse.IsStreamingH\x00R\x0bisStreaming\x12P\n\x0binput_files\x18\x07 \x01(\x0b\x32-.spark.connect.AnalyzePlanResponse.InputFilesH\x00R\ninputFiles\x12V\n\rspark_version\x18\x08 \x01(\x0b\x32/.spark.connect.AnalyzePlanResponse.SparkVersionH\x00R\x0csparkVersion\x12J\n\tddl_parse\x18\t \x01(\x0b\x32+.spark.connect.AnalyzePlanResponse.DDLParseH\x00R\x08\x64\x64lParse\x12Y\n\x0esame_semantics\x18\n \x01(\x0b\x32\x30.spark.connect.AnalyzePlanResponse.SameSemanticsH\x00R\rsameSemantics\x12V\n\rsemantic_hash\x18\x0b 
\x01(\x0b\x32/.spark.connect.AnalyzePlanResponse.SemanticHashH\x00R\x0csemanticHash\x12\x46\n\x07persist\x18\x0c \x01(\x0b\x32*.spark.connect.AnalyzePlanResponse.PersistH\x00R\x07persist\x12L\n\tunpersist\x18\r \x01(\x0b\x32,.spark.connect.AnalyzePlanResponse.UnpersistH\x00R\tunpersist\x12`\n\x11get_storage_level\x18\x0e \x01(\x0b\x32\x32.spark.connect.AnalyzePlanResponse.GetStorageLevelH\x00R\x0fgetStorageLevel\x1a\x39\n\x06Schema\x12/\n\x06schema\x18\x01 \x01(\x0b\x32\x17.spark.connect.DataTypeR\x06schema\x1a\x30\n\x07\x45xplain\x12%\n\x0e\x65xplain_string\x18\x01 \x01(\tR\rexplainString\x1a-\n\nTreeString\x12\x1f\n\x0btree_string\x18\x01 \x01(\tR\ntreeString\x1a$\n\x07IsLocal\x12\x19\n\x08is_local\x18\x01 \x01(\x08R\x07isLocal\x1a\x30\n\x0bIsStreaming\x12!\n\x0cis_streaming\x18\x01 \x01(\x08R\x0bisStreaming\x1a"\n\nInputFiles\x12\x14\n\x05\x66iles\x18\x01 \x03(\tR\x05\x66iles\x1a(\n\x0cSparkVersion\x12\x18\n\x07version\x18\x01 \x01(\tR\x07version\x1a;\n\x08\x44\x44LParse\x12/\n\x06parsed\x18\x01 \x01(\x0b\x32\x17.spark.connect.DataTypeR\x06parsed\x1a\'\n\rSameSemantics\x12\x16\n\x06result\x18\x01 \x01(\x08R\x06result\x1a&\n\x0cSemanticHash\x12\x16\n\x06result\x18\x01 \x01(\x05R\x06result\x1a\t\n\x07Persist\x1a\x0b\n\tUnpersist\x1aS\n\x0fGetStorageLevel\x12@\n\rstorage_level\x18\x01 \x01(\x0b\x32\x1b.spark.connect.StorageLevelR\x0cstorageLevelB\x08\n\x06result"\xa3\x05\n\x12\x45xecutePlanRequest\x12\x1d\n\nsession_id\x18\x01 \x01(\tR\tsessionId\x12V\n&client_observed_server_side_session_id\x18\x08 \x01(\tH\x00R!clientObservedServerSideSessionId\x88\x01\x01\x12=\n\x0cuser_context\x18\x02 \x01(\x0b\x32\x1a.spark.connect.UserContextR\x0buserContext\x12&\n\x0coperation_id\x18\x06 \x01(\tH\x01R\x0boperationId\x88\x01\x01\x12\'\n\x04plan\x18\x03 \x01(\x0b\x32\x13.spark.connect.PlanR\x04plan\x12$\n\x0b\x63lient_type\x18\x04 \x01(\tH\x02R\nclientType\x88\x01\x01\x12X\n\x0frequest_options\x18\x05 \x03(\x0b\x32/.spark.connect.ExecutePlanRequest.RequestOptionR\x0erequestOptions\x12\x12\n\x04tags\x18\x07 \x03(\tR\x04tags\x1a\xa5\x01\n\rRequestOption\x12K\n\x10reattach_options\x18\x01 \x01(\x0b\x32\x1e.spark.connect.ReattachOptionsH\x00R\x0freattachOptions\x12\x35\n\textension\x18\xe7\x07 \x01(\x0b\x32\x14.google.protobuf.AnyH\x00R\textensionB\x10\n\x0erequest_optionB)\n\'_client_observed_server_side_session_idB\x0f\n\r_operation_idB\x0e\n\x0c_client_type"\xe6\x15\n\x13\x45xecutePlanResponse\x12\x1d\n\nsession_id\x18\x01 \x01(\tR\tsessionId\x12\x33\n\x16server_side_session_id\x18\x0f \x01(\tR\x13serverSideSessionId\x12!\n\x0coperation_id\x18\x0c \x01(\tR\x0boperationId\x12\x1f\n\x0bresponse_id\x18\r \x01(\tR\nresponseId\x12P\n\x0b\x61rrow_batch\x18\x02 \x01(\x0b\x32-.spark.connect.ExecutePlanResponse.ArrowBatchH\x00R\narrowBatch\x12\x63\n\x12sql_command_result\x18\x05 \x01(\x0b\x32\x33.spark.connect.ExecutePlanResponse.SqlCommandResultH\x00R\x10sqlCommandResult\x12~\n#write_stream_operation_start_result\x18\x08 \x01(\x0b\x32..spark.connect.WriteStreamOperationStartResultH\x00R\x1fwriteStreamOperationStartResult\x12q\n\x1estreaming_query_command_result\x18\t \x01(\x0b\x32*.spark.connect.StreamingQueryCommandResultH\x00R\x1bstreamingQueryCommandResult\x12k\n\x1cget_resources_command_result\x18\n \x01(\x0b\x32(.spark.connect.GetResourcesCommandResultH\x00R\x19getResourcesCommandResult\x12\x87\x01\n&streaming_query_manager_command_result\x18\x0b 
\x01(\x0b\x32\x31.spark.connect.StreamingQueryManagerCommandResultH\x00R"streamingQueryManagerCommandResult\x12\x87\x01\n&streaming_query_listener_events_result\x18\x10 \x01(\x0b\x32\x31.spark.connect.StreamingQueryListenerEventsResultH\x00R"streamingQueryListenerEventsResult\x12\\\n\x0fresult_complete\x18\x0e \x01(\x0b\x32\x31.spark.connect.ExecutePlanResponse.ResultCompleteH\x00R\x0eresultComplete\x12\x87\x01\n&create_resource_profile_command_result\x18\x11 \x01(\x0b\x32\x31.spark.connect.CreateResourceProfileCommandResultH\x00R"createResourceProfileCommandResult\x12\x65\n\x12\x65xecution_progress\x18\x12 \x01(\x0b\x32\x34.spark.connect.ExecutePlanResponse.ExecutionProgressH\x00R\x11\x65xecutionProgress\x12\x35\n\textension\x18\xe7\x07 \x01(\x0b\x32\x14.google.protobuf.AnyH\x00R\textension\x12\x44\n\x07metrics\x18\x04 \x01(\x0b\x32*.spark.connect.ExecutePlanResponse.MetricsR\x07metrics\x12]\n\x10observed_metrics\x18\x06 \x03(\x0b\x32\x32.spark.connect.ExecutePlanResponse.ObservedMetricsR\x0fobservedMetrics\x12/\n\x06schema\x18\x07 \x01(\x0b\x32\x17.spark.connect.DataTypeR\x06schema\x1aG\n\x10SqlCommandResult\x12\x33\n\x08relation\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x08relation\x1av\n\nArrowBatch\x12\x1b\n\trow_count\x18\x01 \x01(\x03R\x08rowCount\x12\x12\n\x04\x64\x61ta\x18\x02 \x01(\x0cR\x04\x64\x61ta\x12&\n\x0cstart_offset\x18\x03 \x01(\x03H\x00R\x0bstartOffset\x88\x01\x01\x42\x0f\n\r_start_offset\x1a\x85\x04\n\x07Metrics\x12Q\n\x07metrics\x18\x01 \x03(\x0b\x32\x37.spark.connect.ExecutePlanResponse.Metrics.MetricObjectR\x07metrics\x1a\xcc\x02\n\x0cMetricObject\x12\x12\n\x04name\x18\x01 \x01(\tR\x04name\x12\x17\n\x07plan_id\x18\x02 \x01(\x03R\x06planId\x12\x16\n\x06parent\x18\x03 \x01(\x03R\x06parent\x12z\n\x11\x65xecution_metrics\x18\x04 \x03(\x0b\x32M.spark.connect.ExecutePlanResponse.Metrics.MetricObject.ExecutionMetricsEntryR\x10\x65xecutionMetrics\x1a{\n\x15\x45xecutionMetricsEntry\x12\x10\n\x03key\x18\x01 \x01(\tR\x03key\x12L\n\x05value\x18\x02 \x01(\x0b\x32\x36.spark.connect.ExecutePlanResponse.Metrics.MetricValueR\x05value:\x02\x38\x01\x1aX\n\x0bMetricValue\x12\x12\n\x04name\x18\x01 \x01(\tR\x04name\x12\x14\n\x05value\x18\x02 \x01(\x03R\x05value\x12\x1f\n\x0bmetric_type\x18\x03 \x01(\tR\nmetricType\x1at\n\x0fObservedMetrics\x12\x12\n\x04name\x18\x01 \x01(\tR\x04name\x12\x39\n\x06values\x18\x02 \x03(\x0b\x32!.spark.connect.Expression.LiteralR\x06values\x12\x12\n\x04keys\x18\x03 \x03(\tR\x04keys\x1a\x10\n\x0eResultComplete\x1a\xcd\x02\n\x11\x45xecutionProgress\x12V\n\x06stages\x18\x01 \x03(\x0b\x32>.spark.connect.ExecutePlanResponse.ExecutionProgress.StageInfoR\x06stages\x12,\n\x12num_inflight_tasks\x18\x02 \x01(\x03R\x10numInflightTasks\x1a\xb1\x01\n\tStageInfo\x12\x19\n\x08stage_id\x18\x01 \x01(\x03R\x07stageId\x12\x1b\n\tnum_tasks\x18\x02 \x01(\x03R\x08numTasks\x12.\n\x13num_completed_tasks\x18\x03 \x01(\x03R\x11numCompletedTasks\x12(\n\x10input_bytes_read\x18\x04 \x01(\x03R\x0einputBytesRead\x12\x12\n\x04\x64one\x18\x05 \x01(\x08R\x04\x64oneB\x0f\n\rresponse_type"A\n\x08KeyValue\x12\x10\n\x03key\x18\x01 \x01(\tR\x03key\x12\x19\n\x05value\x18\x02 \x01(\tH\x00R\x05value\x88\x01\x01\x42\x08\n\x06_value"\x87\t\n\rConfigRequest\x12\x1d\n\nsession_id\x18\x01 \x01(\tR\tsessionId\x12V\n&client_observed_server_side_session_id\x18\x08 \x01(\tH\x00R!clientObservedServerSideSessionId\x88\x01\x01\x12=\n\x0cuser_context\x18\x02 \x01(\x0b\x32\x1a.spark.connect.UserContextR\x0buserContext\x12\x44\n\toperation\x18\x03 
\x01(\x0b\x32&.spark.connect.ConfigRequest.OperationR\toperation\x12$\n\x0b\x63lient_type\x18\x04 \x01(\tH\x01R\nclientType\x88\x01\x01\x1a\xf2\x03\n\tOperation\x12\x34\n\x03set\x18\x01 \x01(\x0b\x32 .spark.connect.ConfigRequest.SetH\x00R\x03set\x12\x34\n\x03get\x18\x02 \x01(\x0b\x32 .spark.connect.ConfigRequest.GetH\x00R\x03get\x12W\n\x10get_with_default\x18\x03 \x01(\x0b\x32+.spark.connect.ConfigRequest.GetWithDefaultH\x00R\x0egetWithDefault\x12G\n\nget_option\x18\x04 \x01(\x0b\x32&.spark.connect.ConfigRequest.GetOptionH\x00R\tgetOption\x12>\n\x07get_all\x18\x05 \x01(\x0b\x32#.spark.connect.ConfigRequest.GetAllH\x00R\x06getAll\x12:\n\x05unset\x18\x06 \x01(\x0b\x32".spark.connect.ConfigRequest.UnsetH\x00R\x05unset\x12P\n\ris_modifiable\x18\x07 \x01(\x0b\x32).spark.connect.ConfigRequest.IsModifiableH\x00R\x0cisModifiableB\t\n\x07op_type\x1a\x34\n\x03Set\x12-\n\x05pairs\x18\x01 \x03(\x0b\x32\x17.spark.connect.KeyValueR\x05pairs\x1a\x19\n\x03Get\x12\x12\n\x04keys\x18\x01 \x03(\tR\x04keys\x1a?\n\x0eGetWithDefault\x12-\n\x05pairs\x18\x01 \x03(\x0b\x32\x17.spark.connect.KeyValueR\x05pairs\x1a\x1f\n\tGetOption\x12\x12\n\x04keys\x18\x01 \x03(\tR\x04keys\x1a\x30\n\x06GetAll\x12\x1b\n\x06prefix\x18\x01 \x01(\tH\x00R\x06prefix\x88\x01\x01\x42\t\n\x07_prefix\x1a\x1b\n\x05Unset\x12\x12\n\x04keys\x18\x01 \x03(\tR\x04keys\x1a"\n\x0cIsModifiable\x12\x12\n\x04keys\x18\x01 \x03(\tR\x04keysB)\n\'_client_observed_server_side_session_idB\x0e\n\x0c_client_type"\xaf\x01\n\x0e\x43onfigResponse\x12\x1d\n\nsession_id\x18\x01 \x01(\tR\tsessionId\x12\x33\n\x16server_side_session_id\x18\x04 \x01(\tR\x13serverSideSessionId\x12-\n\x05pairs\x18\x02 \x03(\x0b\x32\x17.spark.connect.KeyValueR\x05pairs\x12\x1a\n\x08warnings\x18\x03 \x03(\tR\x08warnings"\xea\x07\n\x13\x41\x64\x64\x41rtifactsRequest\x12\x1d\n\nsession_id\x18\x01 \x01(\tR\tsessionId\x12=\n\x0cuser_context\x18\x02 \x01(\x0b\x32\x1a.spark.connect.UserContextR\x0buserContext\x12V\n&client_observed_server_side_session_id\x18\x07 \x01(\tH\x01R!clientObservedServerSideSessionId\x88\x01\x01\x12$\n\x0b\x63lient_type\x18\x06 \x01(\tH\x02R\nclientType\x88\x01\x01\x12@\n\x05\x62\x61tch\x18\x03 \x01(\x0b\x32(.spark.connect.AddArtifactsRequest.BatchH\x00R\x05\x62\x61tch\x12Z\n\x0b\x62\x65gin_chunk\x18\x04 \x01(\x0b\x32\x37.spark.connect.AddArtifactsRequest.BeginChunkedArtifactH\x00R\nbeginChunk\x12H\n\x05\x63hunk\x18\x05 \x01(\x0b\x32\x30.spark.connect.AddArtifactsRequest.ArtifactChunkH\x00R\x05\x63hunk\x1a\x35\n\rArtifactChunk\x12\x12\n\x04\x64\x61ta\x18\x01 \x01(\x0cR\x04\x64\x61ta\x12\x10\n\x03\x63rc\x18\x02 \x01(\x03R\x03\x63rc\x1ao\n\x13SingleChunkArtifact\x12\x12\n\x04name\x18\x01 \x01(\tR\x04name\x12\x44\n\x04\x64\x61ta\x18\x02 \x01(\x0b\x32\x30.spark.connect.AddArtifactsRequest.ArtifactChunkR\x04\x64\x61ta\x1a]\n\x05\x42\x61tch\x12T\n\tartifacts\x18\x01 \x03(\x0b\x32\x36.spark.connect.AddArtifactsRequest.SingleChunkArtifactR\tartifacts\x1a\xc1\x01\n\x14\x42\x65ginChunkedArtifact\x12\x12\n\x04name\x18\x01 \x01(\tR\x04name\x12\x1f\n\x0btotal_bytes\x18\x02 \x01(\x03R\ntotalBytes\x12\x1d\n\nnum_chunks\x18\x03 \x01(\x03R\tnumChunks\x12U\n\rinitial_chunk\x18\x04 \x01(\x0b\x32\x30.spark.connect.AddArtifactsRequest.ArtifactChunkR\x0cinitialChunkB\t\n\x07payloadB)\n\'_client_observed_server_side_session_idB\x0e\n\x0c_client_type"\x90\x02\n\x14\x41\x64\x64\x41rtifactsResponse\x12\x1d\n\nsession_id\x18\x02 \x01(\tR\tsessionId\x12\x33\n\x16server_side_session_id\x18\x03 \x01(\tR\x13serverSideSessionId\x12Q\n\tartifacts\x18\x01 
\x03(\x0b\x32\x33.spark.connect.AddArtifactsResponse.ArtifactSummaryR\tartifacts\x1aQ\n\x0f\x41rtifactSummary\x12\x12\n\x04name\x18\x01 \x01(\tR\x04name\x12*\n\x11is_crc_successful\x18\x02 \x01(\x08R\x0fisCrcSuccessful"\xc6\x02\n\x17\x41rtifactStatusesRequest\x12\x1d\n\nsession_id\x18\x01 \x01(\tR\tsessionId\x12V\n&client_observed_server_side_session_id\x18\x05 \x01(\tH\x00R!clientObservedServerSideSessionId\x88\x01\x01\x12=\n\x0cuser_context\x18\x02 \x01(\x0b\x32\x1a.spark.connect.UserContextR\x0buserContext\x12$\n\x0b\x63lient_type\x18\x03 \x01(\tH\x01R\nclientType\x88\x01\x01\x12\x14\n\x05names\x18\x04 \x03(\tR\x05namesB)\n\'_client_observed_server_side_session_idB\x0e\n\x0c_client_type"\xe0\x02\n\x18\x41rtifactStatusesResponse\x12\x1d\n\nsession_id\x18\x02 \x01(\tR\tsessionId\x12\x33\n\x16server_side_session_id\x18\x03 \x01(\tR\x13serverSideSessionId\x12Q\n\x08statuses\x18\x01 \x03(\x0b\x32\x35.spark.connect.ArtifactStatusesResponse.StatusesEntryR\x08statuses\x1as\n\rStatusesEntry\x12\x10\n\x03key\x18\x01 \x01(\tR\x03key\x12L\n\x05value\x18\x02 \x01(\x0b\x32\x36.spark.connect.ArtifactStatusesResponse.ArtifactStatusR\x05value:\x02\x38\x01\x1a(\n\x0e\x41rtifactStatus\x12\x16\n\x06\x65xists\x18\x01 \x01(\x08R\x06\x65xists"\xdb\x04\n\x10InterruptRequest\x12\x1d\n\nsession_id\x18\x01 \x01(\tR\tsessionId\x12V\n&client_observed_server_side_session_id\x18\x07 \x01(\tH\x01R!clientObservedServerSideSessionId\x88\x01\x01\x12=\n\x0cuser_context\x18\x02 \x01(\x0b\x32\x1a.spark.connect.UserContextR\x0buserContext\x12$\n\x0b\x63lient_type\x18\x03 \x01(\tH\x02R\nclientType\x88\x01\x01\x12T\n\x0einterrupt_type\x18\x04 \x01(\x0e\x32-.spark.connect.InterruptRequest.InterruptTypeR\rinterruptType\x12%\n\roperation_tag\x18\x05 \x01(\tH\x00R\x0coperationTag\x12#\n\x0coperation_id\x18\x06 \x01(\tH\x00R\x0boperationId"\x80\x01\n\rInterruptType\x12\x1e\n\x1aINTERRUPT_TYPE_UNSPECIFIED\x10\x00\x12\x16\n\x12INTERRUPT_TYPE_ALL\x10\x01\x12\x16\n\x12INTERRUPT_TYPE_TAG\x10\x02\x12\x1f\n\x1bINTERRUPT_TYPE_OPERATION_ID\x10\x03\x42\x0b\n\tinterruptB)\n\'_client_observed_server_side_session_idB\x0e\n\x0c_client_type"\x90\x01\n\x11InterruptResponse\x12\x1d\n\nsession_id\x18\x01 \x01(\tR\tsessionId\x12\x33\n\x16server_side_session_id\x18\x03 \x01(\tR\x13serverSideSessionId\x12\'\n\x0finterrupted_ids\x18\x02 \x03(\tR\x0einterruptedIds"5\n\x0fReattachOptions\x12"\n\x0creattachable\x18\x01 \x01(\x08R\x0creattachable"\x96\x03\n\x16ReattachExecuteRequest\x12\x1d\n\nsession_id\x18\x01 \x01(\tR\tsessionId\x12V\n&client_observed_server_side_session_id\x18\x06 \x01(\tH\x00R!clientObservedServerSideSessionId\x88\x01\x01\x12=\n\x0cuser_context\x18\x02 \x01(\x0b\x32\x1a.spark.connect.UserContextR\x0buserContext\x12!\n\x0coperation_id\x18\x03 \x01(\tR\x0boperationId\x12$\n\x0b\x63lient_type\x18\x04 \x01(\tH\x01R\nclientType\x88\x01\x01\x12-\n\x10last_response_id\x18\x05 \x01(\tH\x02R\x0elastResponseId\x88\x01\x01\x42)\n\'_client_observed_server_side_session_idB\x0e\n\x0c_client_typeB\x13\n\x11_last_response_id"\xc9\x04\n\x15ReleaseExecuteRequest\x12\x1d\n\nsession_id\x18\x01 \x01(\tR\tsessionId\x12V\n&client_observed_server_side_session_id\x18\x07 \x01(\tH\x01R!clientObservedServerSideSessionId\x88\x01\x01\x12=\n\x0cuser_context\x18\x02 \x01(\x0b\x32\x1a.spark.connect.UserContextR\x0buserContext\x12!\n\x0coperation_id\x18\x03 \x01(\tR\x0boperationId\x12$\n\x0b\x63lient_type\x18\x04 \x01(\tH\x02R\nclientType\x88\x01\x01\x12R\n\x0brelease_all\x18\x05 
\x01(\x0b\x32/.spark.connect.ReleaseExecuteRequest.ReleaseAllH\x00R\nreleaseAll\x12X\n\rrelease_until\x18\x06 \x01(\x0b\x32\x31.spark.connect.ReleaseExecuteRequest.ReleaseUntilH\x00R\x0creleaseUntil\x1a\x0c\n\nReleaseAll\x1a/\n\x0cReleaseUntil\x12\x1f\n\x0bresponse_id\x18\x01 \x01(\tR\nresponseIdB\t\n\x07releaseB)\n\'_client_observed_server_side_session_idB\x0e\n\x0c_client_type"\xa5\x01\n\x16ReleaseExecuteResponse\x12\x1d\n\nsession_id\x18\x01 \x01(\tR\tsessionId\x12\x33\n\x16server_side_session_id\x18\x03 \x01(\tR\x13serverSideSessionId\x12&\n\x0coperation_id\x18\x02 \x01(\tH\x00R\x0boperationId\x88\x01\x01\x42\x0f\n\r_operation_id"\xab\x01\n\x15ReleaseSessionRequest\x12\x1d\n\nsession_id\x18\x01 \x01(\tR\tsessionId\x12=\n\x0cuser_context\x18\x02 \x01(\x0b\x32\x1a.spark.connect.UserContextR\x0buserContext\x12$\n\x0b\x63lient_type\x18\x03 \x01(\tH\x00R\nclientType\x88\x01\x01\x42\x0e\n\x0c_client_type"l\n\x16ReleaseSessionResponse\x12\x1d\n\nsession_id\x18\x01 \x01(\tR\tsessionId\x12\x33\n\x16server_side_session_id\x18\x02 \x01(\tR\x13serverSideSessionId"\xcc\x02\n\x18\x46\x65tchErrorDetailsRequest\x12\x1d\n\nsession_id\x18\x01 \x01(\tR\tsessionId\x12V\n&client_observed_server_side_session_id\x18\x05 \x01(\tH\x00R!clientObservedServerSideSessionId\x88\x01\x01\x12=\n\x0cuser_context\x18\x02 \x01(\x0b\x32\x1a.spark.connect.UserContextR\x0buserContext\x12\x19\n\x08\x65rror_id\x18\x03 \x01(\tR\x07\x65rrorId\x12$\n\x0b\x63lient_type\x18\x04 \x01(\tH\x01R\nclientType\x88\x01\x01\x42)\n\'_client_observed_server_side_session_idB\x0e\n\x0c_client_type"\x93\x0c\n\x19\x46\x65tchErrorDetailsResponse\x12\x33\n\x16server_side_session_id\x18\x03 \x01(\tR\x13serverSideSessionId\x12\x1d\n\nsession_id\x18\x04 \x01(\tR\tsessionId\x12)\n\x0eroot_error_idx\x18\x01 \x01(\x05H\x00R\x0crootErrorIdx\x88\x01\x01\x12\x46\n\x06\x65rrors\x18\x02 \x03(\x0b\x32..spark.connect.FetchErrorDetailsResponse.ErrorR\x06\x65rrors\x1a\xae\x01\n\x11StackTraceElement\x12\'\n\x0f\x64\x65\x63laring_class\x18\x01 \x01(\tR\x0e\x64\x65\x63laringClass\x12\x1f\n\x0bmethod_name\x18\x02 \x01(\tR\nmethodName\x12 \n\tfile_name\x18\x03 \x01(\tH\x00R\x08\x66ileName\x88\x01\x01\x12\x1f\n\x0bline_number\x18\x04 \x01(\x05R\nlineNumberB\x0c\n\n_file_name\x1a\xf0\x02\n\x0cQueryContext\x12\x64\n\x0c\x63ontext_type\x18\n \x01(\x0e\x32\x41.spark.connect.FetchErrorDetailsResponse.QueryContext.ContextTypeR\x0b\x63ontextType\x12\x1f\n\x0bobject_type\x18\x01 \x01(\tR\nobjectType\x12\x1f\n\x0bobject_name\x18\x02 \x01(\tR\nobjectName\x12\x1f\n\x0bstart_index\x18\x03 \x01(\x05R\nstartIndex\x12\x1d\n\nstop_index\x18\x04 \x01(\x05R\tstopIndex\x12\x1a\n\x08\x66ragment\x18\x05 \x01(\tR\x08\x66ragment\x12\x1b\n\tcall_site\x18\x06 \x01(\tR\x08\x63\x61llSite\x12\x18\n\x07summary\x18\x07 \x01(\tR\x07summary"%\n\x0b\x43ontextType\x12\x07\n\x03SQL\x10\x00\x12\r\n\tDATAFRAME\x10\x01\x1a\x99\x03\n\x0eSparkThrowable\x12$\n\x0b\x65rror_class\x18\x01 \x01(\tH\x00R\nerrorClass\x88\x01\x01\x12}\n\x12message_parameters\x18\x02 \x03(\x0b\x32N.spark.connect.FetchErrorDetailsResponse.SparkThrowable.MessageParametersEntryR\x11messageParameters\x12\\\n\x0equery_contexts\x18\x03 \x03(\x0b\x32\x35.spark.connect.FetchErrorDetailsResponse.QueryContextR\rqueryContexts\x12 \n\tsql_state\x18\x04 \x01(\tH\x01R\x08sqlState\x88\x01\x01\x1a\x44\n\x16MessageParametersEntry\x12\x10\n\x03key\x18\x01 \x01(\tR\x03key\x12\x14\n\x05value\x18\x02 \x01(\tR\x05value:\x02\x38\x01\x42\x0e\n\x0c_error_classB\x0c\n\n_sql_state\x1a\xdb\x02\n\x05\x45rror\x12\x30\n\x14\x65rror_type_hierarchy\x18\x01 
\x03(\tR\x12\x65rrorTypeHierarchy\x12\x18\n\x07message\x18\x02 \x01(\tR\x07message\x12[\n\x0bstack_trace\x18\x03 \x03(\x0b\x32:.spark.connect.FetchErrorDetailsResponse.StackTraceElementR\nstackTrace\x12 \n\tcause_idx\x18\x04 \x01(\x05H\x00R\x08\x63\x61useIdx\x88\x01\x01\x12\x65\n\x0fspark_throwable\x18\x05 \x01(\x0b\x32\x37.spark.connect.FetchErrorDetailsResponse.SparkThrowableH\x01R\x0esparkThrowable\x88\x01\x01\x42\x0c\n\n_cause_idxB\x12\n\x10_spark_throwableB\x11\n\x0f_root_error_idx2\xb2\x07\n\x13SparkConnectService\x12X\n\x0b\x45xecutePlan\x12!.spark.connect.ExecutePlanRequest\x1a".spark.connect.ExecutePlanResponse"\x00\x30\x01\x12V\n\x0b\x41nalyzePlan\x12!.spark.connect.AnalyzePlanRequest\x1a".spark.connect.AnalyzePlanResponse"\x00\x12G\n\x06\x43onfig\x12\x1c.spark.connect.ConfigRequest\x1a\x1d.spark.connect.ConfigResponse"\x00\x12[\n\x0c\x41\x64\x64\x41rtifacts\x12".spark.connect.AddArtifactsRequest\x1a#.spark.connect.AddArtifactsResponse"\x00(\x01\x12\x63\n\x0e\x41rtifactStatus\x12&.spark.connect.ArtifactStatusesRequest\x1a\'.spark.connect.ArtifactStatusesResponse"\x00\x12P\n\tInterrupt\x12\x1f.spark.connect.InterruptRequest\x1a .spark.connect.InterruptResponse"\x00\x12`\n\x0fReattachExecute\x12%.spark.connect.ReattachExecuteRequest\x1a".spark.connect.ExecutePlanResponse"\x00\x30\x01\x12_\n\x0eReleaseExecute\x12$.spark.connect.ReleaseExecuteRequest\x1a%.spark.connect.ReleaseExecuteResponse"\x00\x12_\n\x0eReleaseSession\x12$.spark.connect.ReleaseSessionRequest\x1a%.spark.connect.ReleaseSessionResponse"\x00\x12h\n\x11\x46\x65tchErrorDetails\x12\'.spark.connect.FetchErrorDetailsRequest\x1a(.spark.connect.FetchErrorDetailsResponse"\x00\x42\x36\n\x1eorg.apache.spark.connect.protoP\x01Z\x12internal/generatedb\x06proto3' + b'\n\x18spark/connect/base.proto\x12\rspark.connect\x1a\x19google/protobuf/any.proto\x1a\x1cspark/connect/commands.proto\x1a\x1aspark/connect/common.proto\x1a\x1fspark/connect/expressions.proto\x1a\x1dspark/connect/relations.proto\x1a\x19spark/connect/types.proto"t\n\x04Plan\x12-\n\x04root\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationH\x00R\x04root\x12\x32\n\x07\x63ommand\x18\x02 \x01(\x0b\x32\x16.spark.connect.CommandH\x00R\x07\x63ommandB\t\n\x07op_type"z\n\x0bUserContext\x12\x17\n\x07user_id\x18\x01 \x01(\tR\x06userId\x12\x1b\n\tuser_name\x18\x02 \x01(\tR\x08userName\x12\x35\n\nextensions\x18\xe7\x07 \x03(\x0b\x32\x14.google.protobuf.AnyR\nextensions"\xf8\x13\n\x12\x41nalyzePlanRequest\x12\x1d\n\nsession_id\x18\x01 \x01(\tR\tsessionId\x12V\n&client_observed_server_side_session_id\x18\x11 \x01(\tH\x01R!clientObservedServerSideSessionId\x88\x01\x01\x12=\n\x0cuser_context\x18\x02 \x01(\x0b\x32\x1a.spark.connect.UserContextR\x0buserContext\x12$\n\x0b\x63lient_type\x18\x03 \x01(\tH\x02R\nclientType\x88\x01\x01\x12\x42\n\x06schema\x18\x04 \x01(\x0b\x32(.spark.connect.AnalyzePlanRequest.SchemaH\x00R\x06schema\x12\x45\n\x07\x65xplain\x18\x05 \x01(\x0b\x32).spark.connect.AnalyzePlanRequest.ExplainH\x00R\x07\x65xplain\x12O\n\x0btree_string\x18\x06 \x01(\x0b\x32,.spark.connect.AnalyzePlanRequest.TreeStringH\x00R\ntreeString\x12\x46\n\x08is_local\x18\x07 \x01(\x0b\x32).spark.connect.AnalyzePlanRequest.IsLocalH\x00R\x07isLocal\x12R\n\x0cis_streaming\x18\x08 \x01(\x0b\x32-.spark.connect.AnalyzePlanRequest.IsStreamingH\x00R\x0bisStreaming\x12O\n\x0binput_files\x18\t \x01(\x0b\x32,.spark.connect.AnalyzePlanRequest.InputFilesH\x00R\ninputFiles\x12U\n\rspark_version\x18\n \x01(\x0b\x32..spark.connect.AnalyzePlanRequest.SparkVersionH\x00R\x0csparkVersion\x12I\n\tddl_parse\x18\x0b 
\x01(\x0b\x32*.spark.connect.AnalyzePlanRequest.DDLParseH\x00R\x08\x64\x64lParse\x12X\n\x0esame_semantics\x18\x0c \x01(\x0b\x32/.spark.connect.AnalyzePlanRequest.SameSemanticsH\x00R\rsameSemantics\x12U\n\rsemantic_hash\x18\r \x01(\x0b\x32..spark.connect.AnalyzePlanRequest.SemanticHashH\x00R\x0csemanticHash\x12\x45\n\x07persist\x18\x0e \x01(\x0b\x32).spark.connect.AnalyzePlanRequest.PersistH\x00R\x07persist\x12K\n\tunpersist\x18\x0f \x01(\x0b\x32+.spark.connect.AnalyzePlanRequest.UnpersistH\x00R\tunpersist\x12_\n\x11get_storage_level\x18\x10 \x01(\x0b\x32\x31.spark.connect.AnalyzePlanRequest.GetStorageLevelH\x00R\x0fgetStorageLevel\x1a\x31\n\x06Schema\x12\'\n\x04plan\x18\x01 \x01(\x0b\x32\x13.spark.connect.PlanR\x04plan\x1a\xbb\x02\n\x07\x45xplain\x12\'\n\x04plan\x18\x01 \x01(\x0b\x32\x13.spark.connect.PlanR\x04plan\x12X\n\x0c\x65xplain_mode\x18\x02 \x01(\x0e\x32\x35.spark.connect.AnalyzePlanRequest.Explain.ExplainModeR\x0b\x65xplainMode"\xac\x01\n\x0b\x45xplainMode\x12\x1c\n\x18\x45XPLAIN_MODE_UNSPECIFIED\x10\x00\x12\x17\n\x13\x45XPLAIN_MODE_SIMPLE\x10\x01\x12\x19\n\x15\x45XPLAIN_MODE_EXTENDED\x10\x02\x12\x18\n\x14\x45XPLAIN_MODE_CODEGEN\x10\x03\x12\x15\n\x11\x45XPLAIN_MODE_COST\x10\x04\x12\x1a\n\x16\x45XPLAIN_MODE_FORMATTED\x10\x05\x1aZ\n\nTreeString\x12\'\n\x04plan\x18\x01 \x01(\x0b\x32\x13.spark.connect.PlanR\x04plan\x12\x19\n\x05level\x18\x02 \x01(\x05H\x00R\x05level\x88\x01\x01\x42\x08\n\x06_level\x1a\x32\n\x07IsLocal\x12\'\n\x04plan\x18\x01 \x01(\x0b\x32\x13.spark.connect.PlanR\x04plan\x1a\x36\n\x0bIsStreaming\x12\'\n\x04plan\x18\x01 \x01(\x0b\x32\x13.spark.connect.PlanR\x04plan\x1a\x35\n\nInputFiles\x12\'\n\x04plan\x18\x01 \x01(\x0b\x32\x13.spark.connect.PlanR\x04plan\x1a\x0e\n\x0cSparkVersion\x1a)\n\x08\x44\x44LParse\x12\x1d\n\nddl_string\x18\x01 \x01(\tR\tddlString\x1ay\n\rSameSemantics\x12\x34\n\x0btarget_plan\x18\x01 \x01(\x0b\x32\x13.spark.connect.PlanR\ntargetPlan\x12\x32\n\nother_plan\x18\x02 \x01(\x0b\x32\x13.spark.connect.PlanR\totherPlan\x1a\x37\n\x0cSemanticHash\x12\'\n\x04plan\x18\x01 \x01(\x0b\x32\x13.spark.connect.PlanR\x04plan\x1a\x97\x01\n\x07Persist\x12\x33\n\x08relation\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x08relation\x12\x45\n\rstorage_level\x18\x02 \x01(\x0b\x32\x1b.spark.connect.StorageLevelH\x00R\x0cstorageLevel\x88\x01\x01\x42\x10\n\x0e_storage_level\x1an\n\tUnpersist\x12\x33\n\x08relation\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x08relation\x12\x1f\n\x08\x62locking\x18\x02 \x01(\x08H\x00R\x08\x62locking\x88\x01\x01\x42\x0b\n\t_blocking\x1a\x46\n\x0fGetStorageLevel\x12\x33\n\x08relation\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x08relationB\t\n\x07\x61nalyzeB)\n\'_client_observed_server_side_session_idB\x0e\n\x0c_client_type"\xce\r\n\x13\x41nalyzePlanResponse\x12\x1d\n\nsession_id\x18\x01 \x01(\tR\tsessionId\x12\x33\n\x16server_side_session_id\x18\x0f \x01(\tR\x13serverSideSessionId\x12\x43\n\x06schema\x18\x02 \x01(\x0b\x32).spark.connect.AnalyzePlanResponse.SchemaH\x00R\x06schema\x12\x46\n\x07\x65xplain\x18\x03 \x01(\x0b\x32*.spark.connect.AnalyzePlanResponse.ExplainH\x00R\x07\x65xplain\x12P\n\x0btree_string\x18\x04 \x01(\x0b\x32-.spark.connect.AnalyzePlanResponse.TreeStringH\x00R\ntreeString\x12G\n\x08is_local\x18\x05 \x01(\x0b\x32*.spark.connect.AnalyzePlanResponse.IsLocalH\x00R\x07isLocal\x12S\n\x0cis_streaming\x18\x06 \x01(\x0b\x32..spark.connect.AnalyzePlanResponse.IsStreamingH\x00R\x0bisStreaming\x12P\n\x0binput_files\x18\x07 
\x01(\x0b\x32-.spark.connect.AnalyzePlanResponse.InputFilesH\x00R\ninputFiles\x12V\n\rspark_version\x18\x08 \x01(\x0b\x32/.spark.connect.AnalyzePlanResponse.SparkVersionH\x00R\x0csparkVersion\x12J\n\tddl_parse\x18\t \x01(\x0b\x32+.spark.connect.AnalyzePlanResponse.DDLParseH\x00R\x08\x64\x64lParse\x12Y\n\x0esame_semantics\x18\n \x01(\x0b\x32\x30.spark.connect.AnalyzePlanResponse.SameSemanticsH\x00R\rsameSemantics\x12V\n\rsemantic_hash\x18\x0b \x01(\x0b\x32/.spark.connect.AnalyzePlanResponse.SemanticHashH\x00R\x0csemanticHash\x12\x46\n\x07persist\x18\x0c \x01(\x0b\x32*.spark.connect.AnalyzePlanResponse.PersistH\x00R\x07persist\x12L\n\tunpersist\x18\r \x01(\x0b\x32,.spark.connect.AnalyzePlanResponse.UnpersistH\x00R\tunpersist\x12`\n\x11get_storage_level\x18\x0e \x01(\x0b\x32\x32.spark.connect.AnalyzePlanResponse.GetStorageLevelH\x00R\x0fgetStorageLevel\x1a\x39\n\x06Schema\x12/\n\x06schema\x18\x01 \x01(\x0b\x32\x17.spark.connect.DataTypeR\x06schema\x1a\x30\n\x07\x45xplain\x12%\n\x0e\x65xplain_string\x18\x01 \x01(\tR\rexplainString\x1a-\n\nTreeString\x12\x1f\n\x0btree_string\x18\x01 \x01(\tR\ntreeString\x1a$\n\x07IsLocal\x12\x19\n\x08is_local\x18\x01 \x01(\x08R\x07isLocal\x1a\x30\n\x0bIsStreaming\x12!\n\x0cis_streaming\x18\x01 \x01(\x08R\x0bisStreaming\x1a"\n\nInputFiles\x12\x14\n\x05\x66iles\x18\x01 \x03(\tR\x05\x66iles\x1a(\n\x0cSparkVersion\x12\x18\n\x07version\x18\x01 \x01(\tR\x07version\x1a;\n\x08\x44\x44LParse\x12/\n\x06parsed\x18\x01 \x01(\x0b\x32\x17.spark.connect.DataTypeR\x06parsed\x1a\'\n\rSameSemantics\x12\x16\n\x06result\x18\x01 \x01(\x08R\x06result\x1a&\n\x0cSemanticHash\x12\x16\n\x06result\x18\x01 \x01(\x05R\x06result\x1a\t\n\x07Persist\x1a\x0b\n\tUnpersist\x1aS\n\x0fGetStorageLevel\x12@\n\rstorage_level\x18\x01 \x01(\x0b\x32\x1b.spark.connect.StorageLevelR\x0cstorageLevelB\x08\n\x06result"\xa3\x05\n\x12\x45xecutePlanRequest\x12\x1d\n\nsession_id\x18\x01 \x01(\tR\tsessionId\x12V\n&client_observed_server_side_session_id\x18\x08 \x01(\tH\x00R!clientObservedServerSideSessionId\x88\x01\x01\x12=\n\x0cuser_context\x18\x02 \x01(\x0b\x32\x1a.spark.connect.UserContextR\x0buserContext\x12&\n\x0coperation_id\x18\x06 \x01(\tH\x01R\x0boperationId\x88\x01\x01\x12\'\n\x04plan\x18\x03 \x01(\x0b\x32\x13.spark.connect.PlanR\x04plan\x12$\n\x0b\x63lient_type\x18\x04 \x01(\tH\x02R\nclientType\x88\x01\x01\x12X\n\x0frequest_options\x18\x05 \x03(\x0b\x32/.spark.connect.ExecutePlanRequest.RequestOptionR\x0erequestOptions\x12\x12\n\x04tags\x18\x07 \x03(\tR\x04tags\x1a\xa5\x01\n\rRequestOption\x12K\n\x10reattach_options\x18\x01 \x01(\x0b\x32\x1e.spark.connect.ReattachOptionsH\x00R\x0freattachOptions\x12\x35\n\textension\x18\xe7\x07 \x01(\x0b\x32\x14.google.protobuf.AnyH\x00R\textensionB\x10\n\x0erequest_optionB)\n\'_client_observed_server_side_session_idB\x0f\n\r_operation_idB\x0e\n\x0c_client_type"\x80\x16\n\x13\x45xecutePlanResponse\x12\x1d\n\nsession_id\x18\x01 \x01(\tR\tsessionId\x12\x33\n\x16server_side_session_id\x18\x0f \x01(\tR\x13serverSideSessionId\x12!\n\x0coperation_id\x18\x0c \x01(\tR\x0boperationId\x12\x1f\n\x0bresponse_id\x18\r \x01(\tR\nresponseId\x12P\n\x0b\x61rrow_batch\x18\x02 \x01(\x0b\x32-.spark.connect.ExecutePlanResponse.ArrowBatchH\x00R\narrowBatch\x12\x63\n\x12sql_command_result\x18\x05 \x01(\x0b\x32\x33.spark.connect.ExecutePlanResponse.SqlCommandResultH\x00R\x10sqlCommandResult\x12~\n#write_stream_operation_start_result\x18\x08 \x01(\x0b\x32..spark.connect.WriteStreamOperationStartResultH\x00R\x1fwriteStreamOperationStartResult\x12q\n\x1estreaming_query_command_result\x18\t 
\x01(\x0b\x32*.spark.connect.StreamingQueryCommandResultH\x00R\x1bstreamingQueryCommandResult\x12k\n\x1cget_resources_command_result\x18\n \x01(\x0b\x32(.spark.connect.GetResourcesCommandResultH\x00R\x19getResourcesCommandResult\x12\x87\x01\n&streaming_query_manager_command_result\x18\x0b \x01(\x0b\x32\x31.spark.connect.StreamingQueryManagerCommandResultH\x00R"streamingQueryManagerCommandResult\x12\x87\x01\n&streaming_query_listener_events_result\x18\x10 \x01(\x0b\x32\x31.spark.connect.StreamingQueryListenerEventsResultH\x00R"streamingQueryListenerEventsResult\x12\\\n\x0fresult_complete\x18\x0e \x01(\x0b\x32\x31.spark.connect.ExecutePlanResponse.ResultCompleteH\x00R\x0eresultComplete\x12\x87\x01\n&create_resource_profile_command_result\x18\x11 \x01(\x0b\x32\x31.spark.connect.CreateResourceProfileCommandResultH\x00R"createResourceProfileCommandResult\x12\x65\n\x12\x65xecution_progress\x18\x12 \x01(\x0b\x32\x34.spark.connect.ExecutePlanResponse.ExecutionProgressH\x00R\x11\x65xecutionProgress\x12\x35\n\textension\x18\xe7\x07 \x01(\x0b\x32\x14.google.protobuf.AnyH\x00R\textension\x12\x44\n\x07metrics\x18\x04 \x01(\x0b\x32*.spark.connect.ExecutePlanResponse.MetricsR\x07metrics\x12]\n\x10observed_metrics\x18\x06 \x03(\x0b\x32\x32.spark.connect.ExecutePlanResponse.ObservedMetricsR\x0fobservedMetrics\x12/\n\x06schema\x18\x07 \x01(\x0b\x32\x17.spark.connect.DataTypeR\x06schema\x1aG\n\x10SqlCommandResult\x12\x33\n\x08relation\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x08relation\x1av\n\nArrowBatch\x12\x1b\n\trow_count\x18\x01 \x01(\x03R\x08rowCount\x12\x12\n\x04\x64\x61ta\x18\x02 \x01(\x0cR\x04\x64\x61ta\x12&\n\x0cstart_offset\x18\x03 \x01(\x03H\x00R\x0bstartOffset\x88\x01\x01\x42\x0f\n\r_start_offset\x1a\x85\x04\n\x07Metrics\x12Q\n\x07metrics\x18\x01 \x03(\x0b\x32\x37.spark.connect.ExecutePlanResponse.Metrics.MetricObjectR\x07metrics\x1a\xcc\x02\n\x0cMetricObject\x12\x12\n\x04name\x18\x01 \x01(\tR\x04name\x12\x17\n\x07plan_id\x18\x02 \x01(\x03R\x06planId\x12\x16\n\x06parent\x18\x03 \x01(\x03R\x06parent\x12z\n\x11\x65xecution_metrics\x18\x04 \x03(\x0b\x32M.spark.connect.ExecutePlanResponse.Metrics.MetricObject.ExecutionMetricsEntryR\x10\x65xecutionMetrics\x1a{\n\x15\x45xecutionMetricsEntry\x12\x10\n\x03key\x18\x01 \x01(\tR\x03key\x12L\n\x05value\x18\x02 \x01(\x0b\x32\x36.spark.connect.ExecutePlanResponse.Metrics.MetricValueR\x05value:\x02\x38\x01\x1aX\n\x0bMetricValue\x12\x12\n\x04name\x18\x01 \x01(\tR\x04name\x12\x14\n\x05value\x18\x02 \x01(\x03R\x05value\x12\x1f\n\x0bmetric_type\x18\x03 \x01(\tR\nmetricType\x1a\x8d\x01\n\x0fObservedMetrics\x12\x12\n\x04name\x18\x01 \x01(\tR\x04name\x12\x39\n\x06values\x18\x02 \x03(\x0b\x32!.spark.connect.Expression.LiteralR\x06values\x12\x12\n\x04keys\x18\x03 \x03(\tR\x04keys\x12\x17\n\x07plan_id\x18\x04 \x01(\x03R\x06planId\x1a\x10\n\x0eResultComplete\x1a\xcd\x02\n\x11\x45xecutionProgress\x12V\n\x06stages\x18\x01 \x03(\x0b\x32>.spark.connect.ExecutePlanResponse.ExecutionProgress.StageInfoR\x06stages\x12,\n\x12num_inflight_tasks\x18\x02 \x01(\x03R\x10numInflightTasks\x1a\xb1\x01\n\tStageInfo\x12\x19\n\x08stage_id\x18\x01 \x01(\x03R\x07stageId\x12\x1b\n\tnum_tasks\x18\x02 \x01(\x03R\x08numTasks\x12.\n\x13num_completed_tasks\x18\x03 \x01(\x03R\x11numCompletedTasks\x12(\n\x10input_bytes_read\x18\x04 \x01(\x03R\x0einputBytesRead\x12\x12\n\x04\x64one\x18\x05 \x01(\x08R\x04\x64oneB\x0f\n\rresponse_type"A\n\x08KeyValue\x12\x10\n\x03key\x18\x01 \x01(\tR\x03key\x12\x19\n\x05value\x18\x02 
\x01(\tH\x00R\x05value\x88\x01\x01\x42\x08\n\x06_value"\x87\t\n\rConfigRequest\x12\x1d\n\nsession_id\x18\x01 \x01(\tR\tsessionId\x12V\n&client_observed_server_side_session_id\x18\x08 \x01(\tH\x00R!clientObservedServerSideSessionId\x88\x01\x01\x12=\n\x0cuser_context\x18\x02 \x01(\x0b\x32\x1a.spark.connect.UserContextR\x0buserContext\x12\x44\n\toperation\x18\x03 \x01(\x0b\x32&.spark.connect.ConfigRequest.OperationR\toperation\x12$\n\x0b\x63lient_type\x18\x04 \x01(\tH\x01R\nclientType\x88\x01\x01\x1a\xf2\x03\n\tOperation\x12\x34\n\x03set\x18\x01 \x01(\x0b\x32 .spark.connect.ConfigRequest.SetH\x00R\x03set\x12\x34\n\x03get\x18\x02 \x01(\x0b\x32 .spark.connect.ConfigRequest.GetH\x00R\x03get\x12W\n\x10get_with_default\x18\x03 \x01(\x0b\x32+.spark.connect.ConfigRequest.GetWithDefaultH\x00R\x0egetWithDefault\x12G\n\nget_option\x18\x04 \x01(\x0b\x32&.spark.connect.ConfigRequest.GetOptionH\x00R\tgetOption\x12>\n\x07get_all\x18\x05 \x01(\x0b\x32#.spark.connect.ConfigRequest.GetAllH\x00R\x06getAll\x12:\n\x05unset\x18\x06 \x01(\x0b\x32".spark.connect.ConfigRequest.UnsetH\x00R\x05unset\x12P\n\ris_modifiable\x18\x07 \x01(\x0b\x32).spark.connect.ConfigRequest.IsModifiableH\x00R\x0cisModifiableB\t\n\x07op_type\x1a\x34\n\x03Set\x12-\n\x05pairs\x18\x01 \x03(\x0b\x32\x17.spark.connect.KeyValueR\x05pairs\x1a\x19\n\x03Get\x12\x12\n\x04keys\x18\x01 \x03(\tR\x04keys\x1a?\n\x0eGetWithDefault\x12-\n\x05pairs\x18\x01 \x03(\x0b\x32\x17.spark.connect.KeyValueR\x05pairs\x1a\x1f\n\tGetOption\x12\x12\n\x04keys\x18\x01 \x03(\tR\x04keys\x1a\x30\n\x06GetAll\x12\x1b\n\x06prefix\x18\x01 \x01(\tH\x00R\x06prefix\x88\x01\x01\x42\t\n\x07_prefix\x1a\x1b\n\x05Unset\x12\x12\n\x04keys\x18\x01 \x03(\tR\x04keys\x1a"\n\x0cIsModifiable\x12\x12\n\x04keys\x18\x01 \x03(\tR\x04keysB)\n\'_client_observed_server_side_session_idB\x0e\n\x0c_client_type"\xaf\x01\n\x0e\x43onfigResponse\x12\x1d\n\nsession_id\x18\x01 \x01(\tR\tsessionId\x12\x33\n\x16server_side_session_id\x18\x04 \x01(\tR\x13serverSideSessionId\x12-\n\x05pairs\x18\x02 \x03(\x0b\x32\x17.spark.connect.KeyValueR\x05pairs\x12\x1a\n\x08warnings\x18\x03 \x03(\tR\x08warnings"\xea\x07\n\x13\x41\x64\x64\x41rtifactsRequest\x12\x1d\n\nsession_id\x18\x01 \x01(\tR\tsessionId\x12=\n\x0cuser_context\x18\x02 \x01(\x0b\x32\x1a.spark.connect.UserContextR\x0buserContext\x12V\n&client_observed_server_side_session_id\x18\x07 \x01(\tH\x01R!clientObservedServerSideSessionId\x88\x01\x01\x12$\n\x0b\x63lient_type\x18\x06 \x01(\tH\x02R\nclientType\x88\x01\x01\x12@\n\x05\x62\x61tch\x18\x03 \x01(\x0b\x32(.spark.connect.AddArtifactsRequest.BatchH\x00R\x05\x62\x61tch\x12Z\n\x0b\x62\x65gin_chunk\x18\x04 \x01(\x0b\x32\x37.spark.connect.AddArtifactsRequest.BeginChunkedArtifactH\x00R\nbeginChunk\x12H\n\x05\x63hunk\x18\x05 \x01(\x0b\x32\x30.spark.connect.AddArtifactsRequest.ArtifactChunkH\x00R\x05\x63hunk\x1a\x35\n\rArtifactChunk\x12\x12\n\x04\x64\x61ta\x18\x01 \x01(\x0cR\x04\x64\x61ta\x12\x10\n\x03\x63rc\x18\x02 \x01(\x03R\x03\x63rc\x1ao\n\x13SingleChunkArtifact\x12\x12\n\x04name\x18\x01 \x01(\tR\x04name\x12\x44\n\x04\x64\x61ta\x18\x02 \x01(\x0b\x32\x30.spark.connect.AddArtifactsRequest.ArtifactChunkR\x04\x64\x61ta\x1a]\n\x05\x42\x61tch\x12T\n\tartifacts\x18\x01 \x03(\x0b\x32\x36.spark.connect.AddArtifactsRequest.SingleChunkArtifactR\tartifacts\x1a\xc1\x01\n\x14\x42\x65ginChunkedArtifact\x12\x12\n\x04name\x18\x01 \x01(\tR\x04name\x12\x1f\n\x0btotal_bytes\x18\x02 \x01(\x03R\ntotalBytes\x12\x1d\n\nnum_chunks\x18\x03 \x01(\x03R\tnumChunks\x12U\n\rinitial_chunk\x18\x04 
\x01(\x0b\x32\x30.spark.connect.AddArtifactsRequest.ArtifactChunkR\x0cinitialChunkB\t\n\x07payloadB)\n\'_client_observed_server_side_session_idB\x0e\n\x0c_client_type"\x90\x02\n\x14\x41\x64\x64\x41rtifactsResponse\x12\x1d\n\nsession_id\x18\x02 \x01(\tR\tsessionId\x12\x33\n\x16server_side_session_id\x18\x03 \x01(\tR\x13serverSideSessionId\x12Q\n\tartifacts\x18\x01 \x03(\x0b\x32\x33.spark.connect.AddArtifactsResponse.ArtifactSummaryR\tartifacts\x1aQ\n\x0f\x41rtifactSummary\x12\x12\n\x04name\x18\x01 \x01(\tR\x04name\x12*\n\x11is_crc_successful\x18\x02 \x01(\x08R\x0fisCrcSuccessful"\xc6\x02\n\x17\x41rtifactStatusesRequest\x12\x1d\n\nsession_id\x18\x01 \x01(\tR\tsessionId\x12V\n&client_observed_server_side_session_id\x18\x05 \x01(\tH\x00R!clientObservedServerSideSessionId\x88\x01\x01\x12=\n\x0cuser_context\x18\x02 \x01(\x0b\x32\x1a.spark.connect.UserContextR\x0buserContext\x12$\n\x0b\x63lient_type\x18\x03 \x01(\tH\x01R\nclientType\x88\x01\x01\x12\x14\n\x05names\x18\x04 \x03(\tR\x05namesB)\n\'_client_observed_server_side_session_idB\x0e\n\x0c_client_type"\xe0\x02\n\x18\x41rtifactStatusesResponse\x12\x1d\n\nsession_id\x18\x02 \x01(\tR\tsessionId\x12\x33\n\x16server_side_session_id\x18\x03 \x01(\tR\x13serverSideSessionId\x12Q\n\x08statuses\x18\x01 \x03(\x0b\x32\x35.spark.connect.ArtifactStatusesResponse.StatusesEntryR\x08statuses\x1as\n\rStatusesEntry\x12\x10\n\x03key\x18\x01 \x01(\tR\x03key\x12L\n\x05value\x18\x02 \x01(\x0b\x32\x36.spark.connect.ArtifactStatusesResponse.ArtifactStatusR\x05value:\x02\x38\x01\x1a(\n\x0e\x41rtifactStatus\x12\x16\n\x06\x65xists\x18\x01 \x01(\x08R\x06\x65xists"\xdb\x04\n\x10InterruptRequest\x12\x1d\n\nsession_id\x18\x01 \x01(\tR\tsessionId\x12V\n&client_observed_server_side_session_id\x18\x07 \x01(\tH\x01R!clientObservedServerSideSessionId\x88\x01\x01\x12=\n\x0cuser_context\x18\x02 \x01(\x0b\x32\x1a.spark.connect.UserContextR\x0buserContext\x12$\n\x0b\x63lient_type\x18\x03 \x01(\tH\x02R\nclientType\x88\x01\x01\x12T\n\x0einterrupt_type\x18\x04 \x01(\x0e\x32-.spark.connect.InterruptRequest.InterruptTypeR\rinterruptType\x12%\n\roperation_tag\x18\x05 \x01(\tH\x00R\x0coperationTag\x12#\n\x0coperation_id\x18\x06 \x01(\tH\x00R\x0boperationId"\x80\x01\n\rInterruptType\x12\x1e\n\x1aINTERRUPT_TYPE_UNSPECIFIED\x10\x00\x12\x16\n\x12INTERRUPT_TYPE_ALL\x10\x01\x12\x16\n\x12INTERRUPT_TYPE_TAG\x10\x02\x12\x1f\n\x1bINTERRUPT_TYPE_OPERATION_ID\x10\x03\x42\x0b\n\tinterruptB)\n\'_client_observed_server_side_session_idB\x0e\n\x0c_client_type"\x90\x01\n\x11InterruptResponse\x12\x1d\n\nsession_id\x18\x01 \x01(\tR\tsessionId\x12\x33\n\x16server_side_session_id\x18\x03 \x01(\tR\x13serverSideSessionId\x12\'\n\x0finterrupted_ids\x18\x02 \x03(\tR\x0einterruptedIds"5\n\x0fReattachOptions\x12"\n\x0creattachable\x18\x01 \x01(\x08R\x0creattachable"\x96\x03\n\x16ReattachExecuteRequest\x12\x1d\n\nsession_id\x18\x01 \x01(\tR\tsessionId\x12V\n&client_observed_server_side_session_id\x18\x06 \x01(\tH\x00R!clientObservedServerSideSessionId\x88\x01\x01\x12=\n\x0cuser_context\x18\x02 \x01(\x0b\x32\x1a.spark.connect.UserContextR\x0buserContext\x12!\n\x0coperation_id\x18\x03 \x01(\tR\x0boperationId\x12$\n\x0b\x63lient_type\x18\x04 \x01(\tH\x01R\nclientType\x88\x01\x01\x12-\n\x10last_response_id\x18\x05 \x01(\tH\x02R\x0elastResponseId\x88\x01\x01\x42)\n\'_client_observed_server_side_session_idB\x0e\n\x0c_client_typeB\x13\n\x11_last_response_id"\xc9\x04\n\x15ReleaseExecuteRequest\x12\x1d\n\nsession_id\x18\x01 \x01(\tR\tsessionId\x12V\n&client_observed_server_side_session_id\x18\x07 
\x01(\tH\x01R!clientObservedServerSideSessionId\x88\x01\x01\x12=\n\x0cuser_context\x18\x02 \x01(\x0b\x32\x1a.spark.connect.UserContextR\x0buserContext\x12!\n\x0coperation_id\x18\x03 \x01(\tR\x0boperationId\x12$\n\x0b\x63lient_type\x18\x04 \x01(\tH\x02R\nclientType\x88\x01\x01\x12R\n\x0brelease_all\x18\x05 \x01(\x0b\x32/.spark.connect.ReleaseExecuteRequest.ReleaseAllH\x00R\nreleaseAll\x12X\n\rrelease_until\x18\x06 \x01(\x0b\x32\x31.spark.connect.ReleaseExecuteRequest.ReleaseUntilH\x00R\x0creleaseUntil\x1a\x0c\n\nReleaseAll\x1a/\n\x0cReleaseUntil\x12\x1f\n\x0bresponse_id\x18\x01 \x01(\tR\nresponseIdB\t\n\x07releaseB)\n\'_client_observed_server_side_session_idB\x0e\n\x0c_client_type"\xa5\x01\n\x16ReleaseExecuteResponse\x12\x1d\n\nsession_id\x18\x01 \x01(\tR\tsessionId\x12\x33\n\x16server_side_session_id\x18\x03 \x01(\tR\x13serverSideSessionId\x12&\n\x0coperation_id\x18\x02 \x01(\tH\x00R\x0boperationId\x88\x01\x01\x42\x0f\n\r_operation_id"\xab\x01\n\x15ReleaseSessionRequest\x12\x1d\n\nsession_id\x18\x01 \x01(\tR\tsessionId\x12=\n\x0cuser_context\x18\x02 \x01(\x0b\x32\x1a.spark.connect.UserContextR\x0buserContext\x12$\n\x0b\x63lient_type\x18\x03 \x01(\tH\x00R\nclientType\x88\x01\x01\x42\x0e\n\x0c_client_type"l\n\x16ReleaseSessionResponse\x12\x1d\n\nsession_id\x18\x01 \x01(\tR\tsessionId\x12\x33\n\x16server_side_session_id\x18\x02 \x01(\tR\x13serverSideSessionId"\xcc\x02\n\x18\x46\x65tchErrorDetailsRequest\x12\x1d\n\nsession_id\x18\x01 \x01(\tR\tsessionId\x12V\n&client_observed_server_side_session_id\x18\x05 \x01(\tH\x00R!clientObservedServerSideSessionId\x88\x01\x01\x12=\n\x0cuser_context\x18\x02 \x01(\x0b\x32\x1a.spark.connect.UserContextR\x0buserContext\x12\x19\n\x08\x65rror_id\x18\x03 \x01(\tR\x07\x65rrorId\x12$\n\x0b\x63lient_type\x18\x04 \x01(\tH\x01R\nclientType\x88\x01\x01\x42)\n\'_client_observed_server_side_session_idB\x0e\n\x0c_client_type"\x93\x0c\n\x19\x46\x65tchErrorDetailsResponse\x12\x33\n\x16server_side_session_id\x18\x03 \x01(\tR\x13serverSideSessionId\x12\x1d\n\nsession_id\x18\x04 \x01(\tR\tsessionId\x12)\n\x0eroot_error_idx\x18\x01 \x01(\x05H\x00R\x0crootErrorIdx\x88\x01\x01\x12\x46\n\x06\x65rrors\x18\x02 \x03(\x0b\x32..spark.connect.FetchErrorDetailsResponse.ErrorR\x06\x65rrors\x1a\xae\x01\n\x11StackTraceElement\x12\'\n\x0f\x64\x65\x63laring_class\x18\x01 \x01(\tR\x0e\x64\x65\x63laringClass\x12\x1f\n\x0bmethod_name\x18\x02 \x01(\tR\nmethodName\x12 \n\tfile_name\x18\x03 \x01(\tH\x00R\x08\x66ileName\x88\x01\x01\x12\x1f\n\x0bline_number\x18\x04 \x01(\x05R\nlineNumberB\x0c\n\n_file_name\x1a\xf0\x02\n\x0cQueryContext\x12\x64\n\x0c\x63ontext_type\x18\n \x01(\x0e\x32\x41.spark.connect.FetchErrorDetailsResponse.QueryContext.ContextTypeR\x0b\x63ontextType\x12\x1f\n\x0bobject_type\x18\x01 \x01(\tR\nobjectType\x12\x1f\n\x0bobject_name\x18\x02 \x01(\tR\nobjectName\x12\x1f\n\x0bstart_index\x18\x03 \x01(\x05R\nstartIndex\x12\x1d\n\nstop_index\x18\x04 \x01(\x05R\tstopIndex\x12\x1a\n\x08\x66ragment\x18\x05 \x01(\tR\x08\x66ragment\x12\x1b\n\tcall_site\x18\x06 \x01(\tR\x08\x63\x61llSite\x12\x18\n\x07summary\x18\x07 \x01(\tR\x07summary"%\n\x0b\x43ontextType\x12\x07\n\x03SQL\x10\x00\x12\r\n\tDATAFRAME\x10\x01\x1a\x99\x03\n\x0eSparkThrowable\x12$\n\x0b\x65rror_class\x18\x01 \x01(\tH\x00R\nerrorClass\x88\x01\x01\x12}\n\x12message_parameters\x18\x02 \x03(\x0b\x32N.spark.connect.FetchErrorDetailsResponse.SparkThrowable.MessageParametersEntryR\x11messageParameters\x12\\\n\x0equery_contexts\x18\x03 \x03(\x0b\x32\x35.spark.connect.FetchErrorDetailsResponse.QueryContextR\rqueryContexts\x12 
\n\tsql_state\x18\x04 \x01(\tH\x01R\x08sqlState\x88\x01\x01\x1a\x44\n\x16MessageParametersEntry\x12\x10\n\x03key\x18\x01 \x01(\tR\x03key\x12\x14\n\x05value\x18\x02 \x01(\tR\x05value:\x02\x38\x01\x42\x0e\n\x0c_error_classB\x0c\n\n_sql_state\x1a\xdb\x02\n\x05\x45rror\x12\x30\n\x14\x65rror_type_hierarchy\x18\x01 \x03(\tR\x12\x65rrorTypeHierarchy\x12\x18\n\x07message\x18\x02 \x01(\tR\x07message\x12[\n\x0bstack_trace\x18\x03 \x03(\x0b\x32:.spark.connect.FetchErrorDetailsResponse.StackTraceElementR\nstackTrace\x12 \n\tcause_idx\x18\x04 \x01(\x05H\x00R\x08\x63\x61useIdx\x88\x01\x01\x12\x65\n\x0fspark_throwable\x18\x05 \x01(\x0b\x32\x37.spark.connect.FetchErrorDetailsResponse.SparkThrowableH\x01R\x0esparkThrowable\x88\x01\x01\x42\x0c\n\n_cause_idxB\x12\n\x10_spark_throwableB\x11\n\x0f_root_error_idx2\xb2\x07\n\x13SparkConnectService\x12X\n\x0b\x45xecutePlan\x12!.spark.connect.ExecutePlanRequest\x1a".spark.connect.ExecutePlanResponse"\x00\x30\x01\x12V\n\x0b\x41nalyzePlan\x12!.spark.connect.AnalyzePlanRequest\x1a".spark.connect.AnalyzePlanResponse"\x00\x12G\n\x06\x43onfig\x12\x1c.spark.connect.ConfigRequest\x1a\x1d.spark.connect.ConfigResponse"\x00\x12[\n\x0c\x41\x64\x64\x41rtifacts\x12".spark.connect.AddArtifactsRequest\x1a#.spark.connect.AddArtifactsResponse"\x00(\x01\x12\x63\n\x0e\x41rtifactStatus\x12&.spark.connect.ArtifactStatusesRequest\x1a\'.spark.connect.ArtifactStatusesResponse"\x00\x12P\n\tInterrupt\x12\x1f.spark.connect.InterruptRequest\x1a .spark.connect.InterruptResponse"\x00\x12`\n\x0fReattachExecute\x12%.spark.connect.ReattachExecuteRequest\x1a".spark.connect.ExecutePlanResponse"\x00\x30\x01\x12_\n\x0eReleaseExecute\x12$.spark.connect.ReleaseExecuteRequest\x1a%.spark.connect.ReleaseExecuteResponse"\x00\x12_\n\x0eReleaseSession\x12$.spark.connect.ReleaseSessionRequest\x1a%.spark.connect.ReleaseSessionResponse"\x00\x12h\n\x11\x46\x65tchErrorDetails\x12\'.spark.connect.FetchErrorDetailsRequest\x1a(.spark.connect.FetchErrorDetailsResponse"\x00\x42\x36\n\x1eorg.apache.spark.connect.protoP\x01Z\x12internal/generatedb\x06proto3' ) _builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, globals()) @@ -120,7 +120,7 @@ _EXECUTEPLANREQUEST_REQUESTOPTION._serialized_start = 5196 _EXECUTEPLANREQUEST_REQUESTOPTION._serialized_end = 5361 _EXECUTEPLANRESPONSE._serialized_start = 5440 - _EXECUTEPLANRESPONSE._serialized_end = 8230 + _EXECUTEPLANRESPONSE._serialized_end = 8256 _EXECUTEPLANRESPONSE_SQLCOMMANDRESULT._serialized_start = 7030 _EXECUTEPLANRESPONSE_SQLCOMMANDRESULT._serialized_end = 7101 _EXECUTEPLANRESPONSE_ARROWBATCH._serialized_start = 7103 @@ -133,96 +133,96 @@ _EXECUTEPLANRESPONSE_METRICS_METRICOBJECT_EXECUTIONMETRICSENTRY._serialized_end = 7651 _EXECUTEPLANRESPONSE_METRICS_METRICVALUE._serialized_start = 7653 _EXECUTEPLANRESPONSE_METRICS_METRICVALUE._serialized_end = 7741 - _EXECUTEPLANRESPONSE_OBSERVEDMETRICS._serialized_start = 7743 - _EXECUTEPLANRESPONSE_OBSERVEDMETRICS._serialized_end = 7859 - _EXECUTEPLANRESPONSE_RESULTCOMPLETE._serialized_start = 7861 - _EXECUTEPLANRESPONSE_RESULTCOMPLETE._serialized_end = 7877 - _EXECUTEPLANRESPONSE_EXECUTIONPROGRESS._serialized_start = 7880 - _EXECUTEPLANRESPONSE_EXECUTIONPROGRESS._serialized_end = 8213 - _EXECUTEPLANRESPONSE_EXECUTIONPROGRESS_STAGEINFO._serialized_start = 8036 - _EXECUTEPLANRESPONSE_EXECUTIONPROGRESS_STAGEINFO._serialized_end = 8213 - _KEYVALUE._serialized_start = 8232 - _KEYVALUE._serialized_end = 8297 - _CONFIGREQUEST._serialized_start = 8300 - _CONFIGREQUEST._serialized_end = 9459 - _CONFIGREQUEST_OPERATION._serialized_start = 
8608 - _CONFIGREQUEST_OPERATION._serialized_end = 9106 - _CONFIGREQUEST_SET._serialized_start = 9108 - _CONFIGREQUEST_SET._serialized_end = 9160 - _CONFIGREQUEST_GET._serialized_start = 9162 - _CONFIGREQUEST_GET._serialized_end = 9187 - _CONFIGREQUEST_GETWITHDEFAULT._serialized_start = 9189 - _CONFIGREQUEST_GETWITHDEFAULT._serialized_end = 9252 - _CONFIGREQUEST_GETOPTION._serialized_start = 9254 - _CONFIGREQUEST_GETOPTION._serialized_end = 9285 - _CONFIGREQUEST_GETALL._serialized_start = 9287 - _CONFIGREQUEST_GETALL._serialized_end = 9335 - _CONFIGREQUEST_UNSET._serialized_start = 9337 - _CONFIGREQUEST_UNSET._serialized_end = 9364 - _CONFIGREQUEST_ISMODIFIABLE._serialized_start = 9366 - _CONFIGREQUEST_ISMODIFIABLE._serialized_end = 9400 - _CONFIGRESPONSE._serialized_start = 9462 - _CONFIGRESPONSE._serialized_end = 9637 - _ADDARTIFACTSREQUEST._serialized_start = 9640 - _ADDARTIFACTSREQUEST._serialized_end = 10642 - _ADDARTIFACTSREQUEST_ARTIFACTCHUNK._serialized_start = 10115 - _ADDARTIFACTSREQUEST_ARTIFACTCHUNK._serialized_end = 10168 - _ADDARTIFACTSREQUEST_SINGLECHUNKARTIFACT._serialized_start = 10170 - _ADDARTIFACTSREQUEST_SINGLECHUNKARTIFACT._serialized_end = 10281 - _ADDARTIFACTSREQUEST_BATCH._serialized_start = 10283 - _ADDARTIFACTSREQUEST_BATCH._serialized_end = 10376 - _ADDARTIFACTSREQUEST_BEGINCHUNKEDARTIFACT._serialized_start = 10379 - _ADDARTIFACTSREQUEST_BEGINCHUNKEDARTIFACT._serialized_end = 10572 - _ADDARTIFACTSRESPONSE._serialized_start = 10645 - _ADDARTIFACTSRESPONSE._serialized_end = 10917 - _ADDARTIFACTSRESPONSE_ARTIFACTSUMMARY._serialized_start = 10836 - _ADDARTIFACTSRESPONSE_ARTIFACTSUMMARY._serialized_end = 10917 - _ARTIFACTSTATUSESREQUEST._serialized_start = 10920 - _ARTIFACTSTATUSESREQUEST._serialized_end = 11246 - _ARTIFACTSTATUSESRESPONSE._serialized_start = 11249 - _ARTIFACTSTATUSESRESPONSE._serialized_end = 11601 - _ARTIFACTSTATUSESRESPONSE_STATUSESENTRY._serialized_start = 11444 - _ARTIFACTSTATUSESRESPONSE_STATUSESENTRY._serialized_end = 11559 - _ARTIFACTSTATUSESRESPONSE_ARTIFACTSTATUS._serialized_start = 11561 - _ARTIFACTSTATUSESRESPONSE_ARTIFACTSTATUS._serialized_end = 11601 - _INTERRUPTREQUEST._serialized_start = 11604 - _INTERRUPTREQUEST._serialized_end = 12207 - _INTERRUPTREQUEST_INTERRUPTTYPE._serialized_start = 12007 - _INTERRUPTREQUEST_INTERRUPTTYPE._serialized_end = 12135 - _INTERRUPTRESPONSE._serialized_start = 12210 - _INTERRUPTRESPONSE._serialized_end = 12354 - _REATTACHOPTIONS._serialized_start = 12356 - _REATTACHOPTIONS._serialized_end = 12409 - _REATTACHEXECUTEREQUEST._serialized_start = 12412 - _REATTACHEXECUTEREQUEST._serialized_end = 12818 - _RELEASEEXECUTEREQUEST._serialized_start = 12821 - _RELEASEEXECUTEREQUEST._serialized_end = 13406 - _RELEASEEXECUTEREQUEST_RELEASEALL._serialized_start = 13275 - _RELEASEEXECUTEREQUEST_RELEASEALL._serialized_end = 13287 - _RELEASEEXECUTEREQUEST_RELEASEUNTIL._serialized_start = 13289 - _RELEASEEXECUTEREQUEST_RELEASEUNTIL._serialized_end = 13336 - _RELEASEEXECUTERESPONSE._serialized_start = 13409 - _RELEASEEXECUTERESPONSE._serialized_end = 13574 - _RELEASESESSIONREQUEST._serialized_start = 13577 - _RELEASESESSIONREQUEST._serialized_end = 13748 - _RELEASESESSIONRESPONSE._serialized_start = 13750 - _RELEASESESSIONRESPONSE._serialized_end = 13858 - _FETCHERRORDETAILSREQUEST._serialized_start = 13861 - _FETCHERRORDETAILSREQUEST._serialized_end = 14193 - _FETCHERRORDETAILSRESPONSE._serialized_start = 14196 - _FETCHERRORDETAILSRESPONSE._serialized_end = 15751 - 
_FETCHERRORDETAILSRESPONSE_STACKTRACEELEMENT._serialized_start = 14425 - _FETCHERRORDETAILSRESPONSE_STACKTRACEELEMENT._serialized_end = 14599 - _FETCHERRORDETAILSRESPONSE_QUERYCONTEXT._serialized_start = 14602 - _FETCHERRORDETAILSRESPONSE_QUERYCONTEXT._serialized_end = 14970 - _FETCHERRORDETAILSRESPONSE_QUERYCONTEXT_CONTEXTTYPE._serialized_start = 14933 - _FETCHERRORDETAILSRESPONSE_QUERYCONTEXT_CONTEXTTYPE._serialized_end = 14970 - _FETCHERRORDETAILSRESPONSE_SPARKTHROWABLE._serialized_start = 14973 - _FETCHERRORDETAILSRESPONSE_SPARKTHROWABLE._serialized_end = 15382 - _FETCHERRORDETAILSRESPONSE_SPARKTHROWABLE_MESSAGEPARAMETERSENTRY._serialized_start = 15284 - _FETCHERRORDETAILSRESPONSE_SPARKTHROWABLE_MESSAGEPARAMETERSENTRY._serialized_end = 15352 - _FETCHERRORDETAILSRESPONSE_ERROR._serialized_start = 15385 - _FETCHERRORDETAILSRESPONSE_ERROR._serialized_end = 15732 - _SPARKCONNECTSERVICE._serialized_start = 15754 - _SPARKCONNECTSERVICE._serialized_end = 16700 + _EXECUTEPLANRESPONSE_OBSERVEDMETRICS._serialized_start = 7744 + _EXECUTEPLANRESPONSE_OBSERVEDMETRICS._serialized_end = 7885 + _EXECUTEPLANRESPONSE_RESULTCOMPLETE._serialized_start = 7887 + _EXECUTEPLANRESPONSE_RESULTCOMPLETE._serialized_end = 7903 + _EXECUTEPLANRESPONSE_EXECUTIONPROGRESS._serialized_start = 7906 + _EXECUTEPLANRESPONSE_EXECUTIONPROGRESS._serialized_end = 8239 + _EXECUTEPLANRESPONSE_EXECUTIONPROGRESS_STAGEINFO._serialized_start = 8062 + _EXECUTEPLANRESPONSE_EXECUTIONPROGRESS_STAGEINFO._serialized_end = 8239 + _KEYVALUE._serialized_start = 8258 + _KEYVALUE._serialized_end = 8323 + _CONFIGREQUEST._serialized_start = 8326 + _CONFIGREQUEST._serialized_end = 9485 + _CONFIGREQUEST_OPERATION._serialized_start = 8634 + _CONFIGREQUEST_OPERATION._serialized_end = 9132 + _CONFIGREQUEST_SET._serialized_start = 9134 + _CONFIGREQUEST_SET._serialized_end = 9186 + _CONFIGREQUEST_GET._serialized_start = 9188 + _CONFIGREQUEST_GET._serialized_end = 9213 + _CONFIGREQUEST_GETWITHDEFAULT._serialized_start = 9215 + _CONFIGREQUEST_GETWITHDEFAULT._serialized_end = 9278 + _CONFIGREQUEST_GETOPTION._serialized_start = 9280 + _CONFIGREQUEST_GETOPTION._serialized_end = 9311 + _CONFIGREQUEST_GETALL._serialized_start = 9313 + _CONFIGREQUEST_GETALL._serialized_end = 9361 + _CONFIGREQUEST_UNSET._serialized_start = 9363 + _CONFIGREQUEST_UNSET._serialized_end = 9390 + _CONFIGREQUEST_ISMODIFIABLE._serialized_start = 9392 + _CONFIGREQUEST_ISMODIFIABLE._serialized_end = 9426 + _CONFIGRESPONSE._serialized_start = 9488 + _CONFIGRESPONSE._serialized_end = 9663 + _ADDARTIFACTSREQUEST._serialized_start = 9666 + _ADDARTIFACTSREQUEST._serialized_end = 10668 + _ADDARTIFACTSREQUEST_ARTIFACTCHUNK._serialized_start = 10141 + _ADDARTIFACTSREQUEST_ARTIFACTCHUNK._serialized_end = 10194 + _ADDARTIFACTSREQUEST_SINGLECHUNKARTIFACT._serialized_start = 10196 + _ADDARTIFACTSREQUEST_SINGLECHUNKARTIFACT._serialized_end = 10307 + _ADDARTIFACTSREQUEST_BATCH._serialized_start = 10309 + _ADDARTIFACTSREQUEST_BATCH._serialized_end = 10402 + _ADDARTIFACTSREQUEST_BEGINCHUNKEDARTIFACT._serialized_start = 10405 + _ADDARTIFACTSREQUEST_BEGINCHUNKEDARTIFACT._serialized_end = 10598 + _ADDARTIFACTSRESPONSE._serialized_start = 10671 + _ADDARTIFACTSRESPONSE._serialized_end = 10943 + _ADDARTIFACTSRESPONSE_ARTIFACTSUMMARY._serialized_start = 10862 + _ADDARTIFACTSRESPONSE_ARTIFACTSUMMARY._serialized_end = 10943 + _ARTIFACTSTATUSESREQUEST._serialized_start = 10946 + _ARTIFACTSTATUSESREQUEST._serialized_end = 11272 + _ARTIFACTSTATUSESRESPONSE._serialized_start = 11275 + 
_ARTIFACTSTATUSESRESPONSE._serialized_end = 11627 + _ARTIFACTSTATUSESRESPONSE_STATUSESENTRY._serialized_start = 11470 + _ARTIFACTSTATUSESRESPONSE_STATUSESENTRY._serialized_end = 11585 + _ARTIFACTSTATUSESRESPONSE_ARTIFACTSTATUS._serialized_start = 11587 + _ARTIFACTSTATUSESRESPONSE_ARTIFACTSTATUS._serialized_end = 11627 + _INTERRUPTREQUEST._serialized_start = 11630 + _INTERRUPTREQUEST._serialized_end = 12233 + _INTERRUPTREQUEST_INTERRUPTTYPE._serialized_start = 12033 + _INTERRUPTREQUEST_INTERRUPTTYPE._serialized_end = 12161 + _INTERRUPTRESPONSE._serialized_start = 12236 + _INTERRUPTRESPONSE._serialized_end = 12380 + _REATTACHOPTIONS._serialized_start = 12382 + _REATTACHOPTIONS._serialized_end = 12435 + _REATTACHEXECUTEREQUEST._serialized_start = 12438 + _REATTACHEXECUTEREQUEST._serialized_end = 12844 + _RELEASEEXECUTEREQUEST._serialized_start = 12847 + _RELEASEEXECUTEREQUEST._serialized_end = 13432 + _RELEASEEXECUTEREQUEST_RELEASEALL._serialized_start = 13301 + _RELEASEEXECUTEREQUEST_RELEASEALL._serialized_end = 13313 + _RELEASEEXECUTEREQUEST_RELEASEUNTIL._serialized_start = 13315 + _RELEASEEXECUTEREQUEST_RELEASEUNTIL._serialized_end = 13362 + _RELEASEEXECUTERESPONSE._serialized_start = 13435 + _RELEASEEXECUTERESPONSE._serialized_end = 13600 + _RELEASESESSIONREQUEST._serialized_start = 13603 + _RELEASESESSIONREQUEST._serialized_end = 13774 + _RELEASESESSIONRESPONSE._serialized_start = 13776 + _RELEASESESSIONRESPONSE._serialized_end = 13884 + _FETCHERRORDETAILSREQUEST._serialized_start = 13887 + _FETCHERRORDETAILSREQUEST._serialized_end = 14219 + _FETCHERRORDETAILSRESPONSE._serialized_start = 14222 + _FETCHERRORDETAILSRESPONSE._serialized_end = 15777 + _FETCHERRORDETAILSRESPONSE_STACKTRACEELEMENT._serialized_start = 14451 + _FETCHERRORDETAILSRESPONSE_STACKTRACEELEMENT._serialized_end = 14625 + _FETCHERRORDETAILSRESPONSE_QUERYCONTEXT._serialized_start = 14628 + _FETCHERRORDETAILSRESPONSE_QUERYCONTEXT._serialized_end = 14996 + _FETCHERRORDETAILSRESPONSE_QUERYCONTEXT_CONTEXTTYPE._serialized_start = 14959 + _FETCHERRORDETAILSRESPONSE_QUERYCONTEXT_CONTEXTTYPE._serialized_end = 14996 + _FETCHERRORDETAILSRESPONSE_SPARKTHROWABLE._serialized_start = 14999 + _FETCHERRORDETAILSRESPONSE_SPARKTHROWABLE._serialized_end = 15408 + _FETCHERRORDETAILSRESPONSE_SPARKTHROWABLE_MESSAGEPARAMETERSENTRY._serialized_start = 15310 + _FETCHERRORDETAILSRESPONSE_SPARKTHROWABLE_MESSAGEPARAMETERSENTRY._serialized_end = 15378 + _FETCHERRORDETAILSRESPONSE_ERROR._serialized_start = 15411 + _FETCHERRORDETAILSRESPONSE_ERROR._serialized_end = 15758 + _SPARKCONNECTSERVICE._serialized_start = 15780 + _SPARKCONNECTSERVICE._serialized_end = 16726 # @@protoc_insertion_point(module_scope) diff --git a/python/pyspark/sql/connect/proto/base_pb2.pyi b/python/pyspark/sql/connect/proto/base_pb2.pyi index d22502f8839db..b76f2a7f4de34 100644 --- a/python/pyspark/sql/connect/proto/base_pb2.pyi +++ b/python/pyspark/sql/connect/proto/base_pb2.pyi @@ -1406,6 +1406,7 @@ class ExecutePlanResponse(google.protobuf.message.Message): NAME_FIELD_NUMBER: builtins.int VALUES_FIELD_NUMBER: builtins.int KEYS_FIELD_NUMBER: builtins.int + PLAN_ID_FIELD_NUMBER: builtins.int name: builtins.str @property def values( @@ -1417,6 +1418,7 @@ class ExecutePlanResponse(google.protobuf.message.Message): def keys( self, ) -> google.protobuf.internal.containers.RepeatedScalarFieldContainer[builtins.str]: ... 
+ plan_id: builtins.int def __init__( self, *, @@ -1426,11 +1428,12 @@ class ExecutePlanResponse(google.protobuf.message.Message): ] | None = ..., keys: collections.abc.Iterable[builtins.str] | None = ..., + plan_id: builtins.int = ..., ) -> None: ... def ClearField( self, field_name: typing_extensions.Literal[ - "keys", b"keys", "name", b"name", "values", b"values" + "keys", b"keys", "name", b"name", "plan_id", b"plan_id", "values", b"values" ], ) -> None: ... diff --git a/python/pyspark/sql/tests/connect/test_connect_function.py b/python/pyspark/sql/tests/connect/test_connect_function.py index 9d4db8cf7d15d..0f0abfd4b8567 100644 --- a/python/pyspark/sql/tests/connect/test_connect_function.py +++ b/python/pyspark/sql/tests/connect/test_connect_function.py @@ -21,7 +21,14 @@ from pyspark.util import is_remote_only from pyspark.errors import PySparkTypeError, PySparkValueError from pyspark.sql import SparkSession as PySparkSession -from pyspark.sql.types import StringType, StructType, StructField, ArrayType, IntegerType +from pyspark.sql.types import ( + _drop_metadata, + StringType, + StructType, + StructField, + ArrayType, + IntegerType, +) from pyspark.testing import assertDataFrameEqual from pyspark.testing.pandasutils import PandasOnSparkTestUtils from pyspark.testing.connectutils import ReusedConnectTestCase, should_test_connect @@ -1668,7 +1675,7 @@ def test_nested_lambda_function(self): ) # TODO: 'cdf.schema' has an extra metadata '{'__autoGeneratedAlias': 'true'}' - # self.assertEqual(cdf.schema, sdf.schema) + self.assertEqual(_drop_metadata(cdf.schema), _drop_metadata(sdf.schema)) self.assertEqual(cdf.collect(), sdf.collect()) def test_csv_functions(self): diff --git a/python/pyspark/sql/tests/connect/test_connect_plan.py b/python/pyspark/sql/tests/connect/test_connect_plan.py index 09c3171ee11fd..e8d04aeada740 100644 --- a/python/pyspark/sql/tests/connect/test_connect_plan.py +++ b/python/pyspark/sql/tests/connect/test_connect_plan.py @@ -443,7 +443,7 @@ def test_sample(self): self.assertEqual(plan.root.sample.lower_bound, 0.0) self.assertEqual(plan.root.sample.upper_bound, 0.3) self.assertEqual(plan.root.sample.with_replacement, False) - self.assertEqual(plan.root.sample.HasField("seed"), False) + self.assertEqual(plan.root.sample.HasField("seed"), True) self.assertEqual(plan.root.sample.deterministic_order, False) plan = ( diff --git a/python/pyspark/sql/tests/test_dataframe.py b/python/pyspark/sql/tests/test_dataframe.py index 16dd0d2a3bf7c..f491b496ddae5 100644 --- a/python/pyspark/sql/tests/test_dataframe.py +++ b/python/pyspark/sql/tests/test_dataframe.py @@ -430,6 +430,11 @@ def test_sample(self): IllegalArgumentException, lambda: self.spark.range(1).sample(-1.0).count() ) + def test_sample_with_random_seed(self): + df = self.spark.range(10000).sample(0.1) + cnts = [df.count() for i in range(10)] + self.assertEqual(1, len(set(cnts))) + def test_toDF_with_string(self): df = self.spark.createDataFrame([("John", 30), ("Alice", 25), ("Bob", 28)]) data = [("John", 30), ("Alice", 25), ("Bob", 28)] diff --git a/python/pyspark/sql/types.py b/python/pyspark/sql/types.py index 48aa3e8e4faba..41be12620fd56 100644 --- a/python/pyspark/sql/types.py +++ b/python/pyspark/sql/types.py @@ -1569,6 +1569,19 @@ def toJson(self, zone_id: str = "UTC") -> str: _INTERVAL_YEARMONTH = re.compile(r"interval (year|month)( to (year|month))?") +def _drop_metadata(d: Union[DataType, StructField]) -> Union[DataType, StructField]: + assert isinstance(d, (DataType, StructField)) + if isinstance(d, 
StructField): + return StructField(d.name, _drop_metadata(d.dataType), d.nullable, None) + elif isinstance(d, StructType): + return StructType([cast(StructField, _drop_metadata(f)) for f in d.fields]) + elif isinstance(d, ArrayType): + return ArrayType(_drop_metadata(d.elementType), d.containsNull) + elif isinstance(d, MapType): + return MapType(_drop_metadata(d.keyType), _drop_metadata(d.valueType), d.valueContainsNull) + return d + + def _parse_datatype_string(s: str) -> DataType: """ Parses the given data type string to a :class:`DataType`. The data type string format equals diff --git a/sql/api/src/main/scala/org/apache/spark/sql/ObservationBase.scala b/sql/api/src/main/scala/org/apache/spark/sql/ObservationBase.scala new file mode 100644 index 0000000000000..4789ae8975d12 --- /dev/null +++ b/sql/api/src/main/scala/org/apache/spark/sql/ObservationBase.scala @@ -0,0 +1,113 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql + +import scala.jdk.CollectionConverters.MapHasAsJava + +/** + * Helper class to simplify usage of `Dataset.observe(String, Column, Column*)`: + * + * {{{ + * // Observe row count (rows) and highest id (maxid) in the Dataset while writing it + * val observation = Observation("my metrics") + * val observed_ds = ds.observe(observation, count(lit(1)).as("rows"), max($"id").as("maxid")) + * observed_ds.write.parquet("ds.parquet") + * val metrics = observation.get + * }}} + * + * This collects the metrics while the first action is executed on the observed dataset. Subsequent + * actions do not modify the metrics returned by [[get]]. Retrieval of the metric via [[get]] + * blocks until the first action has finished and metrics become available. + * + * This class does not support streaming datasets. + * + * @param name name of the metric + * @since 3.3.0 + */ +abstract class ObservationBase(val name: String) { + + if (name.isEmpty) throw new IllegalArgumentException("Name must not be empty") + + @volatile protected var metrics: Option[Map[String, Any]] = None + + /** + * (Scala-specific) Get the observed metrics. This waits for the observed dataset to finish + * its first action. Only the result of the first action is available. Subsequent actions do not + * modify the result. + * + * @return the observed metrics as a `Map[String, Any]` + * @throws InterruptedException interrupted while waiting + */ + @throws[InterruptedException] + def get: Map[String, _] = { + synchronized { + // we need to loop as wait might return without us calling notify + // https://en.wikipedia.org/w/index.php?title=Spurious_wakeup&oldid=992601610 + while (this.metrics.isEmpty) { + wait() + } + } + + this.metrics.get + } + + /** + * (Java-specific) Get the observed metrics. 
This waits for the observed dataset to finish + * its first action. Only the result of the first action is available. Subsequent actions do not + * modify the result. + * + * @return the observed metrics as a `java.util.Map[String, Object]` + * @throws InterruptedException interrupted while waiting + */ + @throws[InterruptedException] + def getAsJava: java.util.Map[String, AnyRef] = { + get.map { case (key, value) => (key, value.asInstanceOf[Object]) }.asJava + } + + /** + * Get the observed metrics. This returns the metrics if they are available, otherwise an empty map. + * + * @return the observed metrics as a `Map[String, Any]` + */ + @throws[InterruptedException] + private[sql] def getOrEmpty: Map[String, _] = { + synchronized { + if (metrics.isEmpty) { + wait(100) // Wait for 100ms to see if metrics are available + } + metrics.getOrElse(Map.empty) + } + } + + /** + * Set the observed metrics and notify all waiting threads to resume. + * + * @return `true` if all waiting threads were notified, `false` otherwise. + */ + private[spark] def setMetricsAndNotify(metrics: Option[Map[String, Any]]): Boolean = { + synchronized { + this.metrics = metrics + if (metrics.isDefined) { + notifyAll() + true + } else { + false + } + } + } +} diff --git a/sql/api/src/main/scala/org/apache/spark/sql/catalyst/util/SparkDateTimeUtils.scala b/sql/api/src/main/scala/org/apache/spark/sql/catalyst/util/SparkDateTimeUtils.scala index 8db8c3cd39d74..0447d813e26a5 100644 --- a/sql/api/src/main/scala/org/apache/spark/sql/catalyst/util/SparkDateTimeUtils.scala +++ b/sql/api/src/main/scala/org/apache/spark/sql/catalyst/util/SparkDateTimeUtils.scala @@ -198,7 +198,7 @@ trait SparkDateTimeUtils { } private val zoneInfoClassName = "sun.util.calendar.ZoneInfo" - private val getOffsetsByWallHandle = { + private lazy val getOffsetsByWallHandle = { val lookup = MethodHandles.lookup() val classType = SparkClassUtils.classForName(zoneInfoClassName) val methodName = "getOffsetsByWall" diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeCreator.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeCreator.scala index c38b6cea9a0a5..4c0d005340606 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeCreator.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeCreator.scala @@ -374,7 +374,8 @@ object CreateStruct { // We should always use the last part of the column name (`c` in the above example) as the // alias name inside CreateNamedStruct.
case (u: UnresolvedAttribute, _) => Seq(Literal(u.nameParts.last), u) - case (u @ UnresolvedExtractValue(_, e: Literal), _) if e.dataType == StringType => Seq(e, u) + case (u @ UnresolvedExtractValue(_, e: Literal), _) if e.dataType.isInstanceOf[StringType] => + Seq(e, u) case (a: Alias, _) => Seq(Literal(a.name), a) case (e: NamedExpression, _) if e.resolved => Seq(Literal(e.name), e) case (e: NamedExpression, _) => Seq(NamePlaceholder, e) @@ -465,7 +466,7 @@ case class CreateNamedStruct(children: Seq[Expression]) extends Expression with toSQLId(prettyName), Seq("2n (n > 0)"), children.length ) } else { - val invalidNames = nameExprs.filterNot(e => e.foldable && e.dataType == StringType) + val invalidNames = nameExprs.filterNot(e => e.foldable && e.dataType.isInstanceOf[StringType]) if (invalidNames.nonEmpty) { DataTypeMismatch( errorSubClass = "CREATE_NAMED_STRUCT_WITHOUT_FOLDABLE_STRING", diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/jsonExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/jsonExpressions.scala index 35e30ceb45cb5..8258bb389e2da 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/jsonExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/jsonExpressions.scala @@ -37,6 +37,7 @@ import org.apache.spark.sql.catalyst.trees.TreePattern.{JSON_TO_STRUCT, TreePatt import org.apache.spark.sql.catalyst.util._ import org.apache.spark.sql.errors.{QueryCompilationErrors, QueryErrorsBase} import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.sql.internal.types.StringTypeAnyCollation import org.apache.spark.sql.types._ import org.apache.spark.unsafe.types.{UTF8String, VariantVal} import org.apache.spark.util.Utils @@ -132,8 +133,9 @@ case class GetJsonObject(json: Expression, path: Expression) override def left: Expression = json override def right: Expression = path - override def inputTypes: Seq[DataType] = Seq(StringType, StringType) - override def dataType: DataType = StringType + override def inputTypes: Seq[AbstractDataType] = + Seq(StringTypeAnyCollation, StringTypeAnyCollation) + override def dataType: DataType = SQLConf.get.defaultStringType override def nullable: Boolean = true override def prettyName: String = "get_json_object" @@ -477,7 +479,7 @@ case class JsonTuple(children: Seq[Expression]) @transient private lazy val constantFields: Int = foldableFieldNames.count(_ != null) override def elementSchema: StructType = StructType(fieldExpressions.zipWithIndex.map { - case (_, idx) => StructField(s"c$idx", StringType, nullable = true) + case (_, idx) => StructField(s"c$idx", children.head.dataType, nullable = true) }) override def prettyName: String = "json_tuple" @@ -487,7 +489,7 @@ case class JsonTuple(children: Seq[Expression]) throw QueryCompilationErrors.wrongNumArgsError( toSQLId(prettyName), Seq("> 1"), children.length ) - } else if (children.forall(child => StringType.acceptsType(child.dataType))) { + } else if (children.forall(child => StringTypeAnyCollation.acceptsType(child.dataType))) { TypeCheckResult.TypeCheckSuccess } else { DataTypeMismatch( @@ -722,7 +724,7 @@ case class JsonToStructs( converter(parser.parse(json.asInstanceOf[UTF8String])) } - override def inputTypes: Seq[AbstractDataType] = StringType :: Nil + override def inputTypes: Seq[AbstractDataType] = StringTypeAnyCollation :: Nil override def sql: String = schema match { case _: MapType => "entries" @@ -824,7 +826,7 @@ case class 
StructsToJson( } } - override def dataType: DataType = StringType + override def dataType: DataType = SQLConf.get.defaultStringType override def checkInputDataTypes(): TypeCheckResult = inputSchema match { case dt @ (_: StructType | _: MapType | _: ArrayType | _: VariantType) => @@ -957,7 +959,7 @@ case class SchemaOfJson( case class LengthOfJsonArray(child: Expression) extends UnaryExpression with CodegenFallback with ExpectsInputTypes { - override def inputTypes: Seq[DataType] = Seq(StringType) + override def inputTypes: Seq[AbstractDataType] = Seq(StringTypeAnyCollation) override def dataType: DataType = IntegerType override def nullable: Boolean = true override def prettyName: String = "json_array_length" @@ -1030,8 +1032,8 @@ case class LengthOfJsonArray(child: Expression) extends UnaryExpression case class JsonObjectKeys(child: Expression) extends UnaryExpression with CodegenFallback with ExpectsInputTypes { - override def inputTypes: Seq[DataType] = Seq(StringType) - override def dataType: DataType = ArrayType(StringType) + override def inputTypes: Seq[AbstractDataType] = Seq(StringTypeAnyCollation) + override def dataType: DataType = ArrayType(SQLConf.get.defaultStringType) override def nullable: Boolean = true override def prettyName: String = "json_object_keys" diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala index c2ea17de19533..0bdd7930b0bf9 100755 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala @@ -2646,7 +2646,7 @@ object Decode { arguments = """ Arguments: * bin - a binary expression to decode - * charset - one of the charsets 'US-ASCII', 'ISO-8859-1', 'UTF-8', 'UTF-16BE', 'UTF-16LE', 'UTF-16' to decode `bin` into a STRING. It is case insensitive. + * charset - one of the charsets 'US-ASCII', 'ISO-8859-1', 'UTF-8', 'UTF-16BE', 'UTF-16LE', 'UTF-16', 'UTF-32' to decode `bin` into a STRING. It is case insensitive. """, examples = """ Examples: @@ -2690,7 +2690,7 @@ case class Decode(params: Seq[Expression], replacement: Expression) arguments = """ Arguments: * bin - a binary expression to decode - * charset - one of the charsets 'US-ASCII', 'ISO-8859-1', 'UTF-8', 'UTF-16BE', 'UTF-16LE', 'UTF-16' to decode `bin` into a STRING. It is case insensitive. + * charset - one of the charsets 'US-ASCII', 'ISO-8859-1', 'UTF-8', 'UTF-16BE', 'UTF-16LE', 'UTF-16', 'UTF-32' to decode `bin` into a STRING. It is case insensitive. """, since = "1.5.0", group = "string_funcs") @@ -2707,7 +2707,7 @@ case class StringDecode(bin: Expression, charset: Expression, legacyCharsets: Bo override def inputTypes: Seq[AbstractDataType] = Seq(BinaryType, StringTypeAnyCollation) private val supportedCharsets = Set( - "US-ASCII", "ISO-8859-1", "UTF-8", "UTF-16BE", "UTF-16LE", "UTF-16") + "US-ASCII", "ISO-8859-1", "UTF-8", "UTF-16BE", "UTF-16LE", "UTF-16", "UTF-32") protected override def nullSafeEval(input1: Any, input2: Any): Any = { val fromCharset = input2.asInstanceOf[UTF8String].toString @@ -2762,7 +2762,7 @@ object StringDecode { arguments = """ Arguments: * str - a string expression - * charset - one of the charsets 'US-ASCII', 'ISO-8859-1', 'UTF-8', 'UTF-16BE', 'UTF-16LE', 'UTF-16' to encode `str` into a BINARY. It is case insensitive. 
+ * charset - one of the charsets 'US-ASCII', 'ISO-8859-1', 'UTF-8', 'UTF-16BE', 'UTF-16LE', 'UTF-16', 'UTF-32' to encode `str` into a BINARY. It is case insensitive. """, examples = """ Examples: @@ -2785,7 +2785,7 @@ case class Encode(str: Expression, charset: Expression, legacyCharsets: Boolean) Seq(StringTypeAnyCollation, StringTypeAnyCollation) private val supportedCharsets = Set( - "US-ASCII", "ISO-8859-1", "UTF-8", "UTF-16BE", "UTF-16LE", "UTF-16") + "US-ASCII", "ISO-8859-1", "UTF-8", "UTF-16BE", "UTF-16LE", "UTF-16", "UTF-32") protected override def nullSafeEval(input1: Any, input2: Any): Any = { val toCharset = input2.asInstanceOf[UTF8String].toString diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JacksonParser.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JacksonParser.scala index 3c42f72fa6b6c..848c20ee36bef 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JacksonParser.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JacksonParser.scala @@ -275,7 +275,7 @@ class JacksonParser( } } - case StringType => + case _: StringType => (parser: JsonParser) => parseJsonToken[UTF8String](parser, dataType) { case VALUE_STRING => UTF8String.fromString(parser.getText) diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringExpressionsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringExpressionsSuite.scala index 0fcceef392389..51de44d8dfd98 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringExpressionsSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringExpressionsSuite.scala @@ -489,6 +489,8 @@ class StringExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper { // non ascii characters are not allowed in the code, so we disable the scalastyle here. checkEvaluation( StringDecode(Encode(Literal("大千世界"), Literal("UTF-16LE")), Literal("UTF-16LE")), "大千世界") + checkEvaluation( + StringDecode(Encode(Literal("大千世界"), Literal("UTF-32")), Literal("UTF-32")), "大千世界") checkEvaluation( StringDecode(Encode(a, Literal("utf-8")), Literal("utf-8")), "大千世界", create_row("大千世界")) checkEvaluation( diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataSourceRegistration.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataSourceRegistration.scala index 63cee8861c5a4..8ffdbb952b082 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/DataSourceRegistration.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/DataSourceRegistration.scala @@ -30,7 +30,7 @@ import org.apache.spark.sql.internal.SQLConf * Use `SparkSession.dataSource` to access this. 
*/ @Evolving -private[sql] class DataSourceRegistration private[sql] (dataSourceManager: DataSourceManager) +class DataSourceRegistration private[sql] (dataSourceManager: DataSourceManager) extends Logging { protected[sql] def registerPython( diff --git a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala index 18c9704afdf83..3e843e64ebbf6 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala @@ -3904,8 +3904,7 @@ class Dataset[T] private[sql]( * @since 1.6.0 */ def unpersist(blocking: Boolean): this.type = { - sparkSession.sharedState.cacheManager.uncacheQuery( - sparkSession, logicalPlan, cascade = false, blocking) + sparkSession.sharedState.cacheManager.uncacheQuery(this, cascade = false, blocking) this } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/Observation.scala b/sql/core/src/main/scala/org/apache/spark/sql/Observation.scala index 104e7c101fd1c..30d5943c60922 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/Observation.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/Observation.scala @@ -19,8 +19,6 @@ package org.apache.spark.sql import java.util.UUID -import scala.jdk.CollectionConverters.MapHasAsJava - import org.apache.spark.sql.catalyst.plans.logical.CollectMetrics import org.apache.spark.sql.execution.QueryExecution import org.apache.spark.sql.util.QueryExecutionListener @@ -47,9 +45,7 @@ import org.apache.spark.util.ArrayImplicits._ * @param name name of the metric * @since 3.3.0 */ -class Observation(val name: String) { - - if (name.isEmpty) throw new IllegalArgumentException("Name must not be empty") +class Observation(name: String) extends ObservationBase(name) { /** * Create an Observation instance without providing a name. This generates a random name. @@ -60,8 +56,6 @@ class Observation(val name: String) { @volatile private var dataframeId: Option[(SparkSession, Long)] = None - @volatile private var metrics: Option[Map[String, Any]] = None - /** * Attach this observation to the given [[Dataset]] to observe aggregation expressions. * @@ -83,55 +77,6 @@ class Observation(val name: String) { ds.observe(name, expr, exprs: _*) } - /** - * (Scala-specific) Get the observed metrics. This waits for the observed dataset to finish - * its first action. Only the result of the first action is available. Subsequent actions do not - * modify the result. - * - * @return the observed metrics as a `Map[String, Any]` - * @throws InterruptedException interrupted while waiting - */ - @throws[InterruptedException] - def get: Map[String, _] = { - synchronized { - // we need to loop as wait might return without us calling notify - // https://en.wikipedia.org/w/index.php?title=Spurious_wakeup&oldid=992601610 - while (this.metrics.isEmpty) { - wait() - } - } - - this.metrics.get - } - - /** - * (Java-specific) Get the observed metrics. This waits for the observed dataset to finish - * its first action. Only the result of the first action is available. Subsequent actions do not - * modify the result. - * - * @return the observed metrics as a `java.util.Map[String, Object]` - * @throws InterruptedException interrupted while waiting - */ - @throws[InterruptedException] - def getAsJava: java.util.Map[String, AnyRef] = { - get.map { case (key, value) => (key, value.asInstanceOf[Object])}.asJava - } - - /** - * Get the observed metrics. This returns the metrics if they are available, otherwise an empty. 
- * - * @return the observed metrics as a `Map[String, Any]` - */ - @throws[InterruptedException] - private[sql] def getOrEmpty: Map[String, _] = { - synchronized { - if (metrics.isEmpty) { - wait(100) // Wait for 100ms to see if metrics are available - } - metrics.getOrElse(Map.empty) - } - } - private[sql] def register(sparkSession: SparkSession, dataframeId: Long): Unit = { // makes this class thread-safe: // only the first thread entering this block can set sparkSession @@ -158,9 +103,8 @@ class Observation(val name: String) { case _ => false }) { val row = qe.observedMetrics.get(name) - this.metrics = row.map(r => r.getValuesMap[Any](r.schema.fieldNames.toImmutableArraySeq)) - if (metrics.isDefined) { - notifyAll() + val metrics = row.map(r => r.getValuesMap[Any](r.schema.fieldNames.toImmutableArraySeq)) + if (setMetricsAndNotify(metrics)) { unregister() } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala b/sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala index 5d85f070fbbe4..d5de74455dceb 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala @@ -234,7 +234,7 @@ class SparkSession private( /** * A collection of methods for registering user-defined data sources. */ - private[sql] def dataSource: DataSourceRegistration = sessionState.dataSourceRegistration + def dataSource: DataSourceRegistration = sessionState.dataSourceRegistration /** * Returns a `StreamingQueryManager` that allows managing all the diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/CacheManager.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/CacheManager.scala index ae99873a9f774..b96f257e6b5b6 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/CacheManager.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/CacheManager.scala @@ -25,7 +25,7 @@ import org.apache.spark.sql.{Dataset, SparkSession} import org.apache.spark.sql.catalyst.catalog.HiveTableRelation import org.apache.spark.sql.catalyst.expressions.{Attribute, SubqueryExpression} import org.apache.spark.sql.catalyst.optimizer.EliminateResolvedHint -import org.apache.spark.sql.catalyst.plans.logical.{IgnoreCachedData, LogicalPlan, ResolvedHint, SubqueryAlias, View} +import org.apache.spark.sql.catalyst.plans.logical.{IgnoreCachedData, LogicalPlan, ResolvedHint, View} import org.apache.spark.sql.catalyst.trees.TreePattern.PLAN_EXPRESSION import org.apache.spark.sql.catalyst.util.sideBySide import org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanHelper @@ -38,7 +38,10 @@ import org.apache.spark.storage.StorageLevel import org.apache.spark.storage.StorageLevel.MEMORY_AND_DISK /** Holds a cached logical plan and its data */ -case class CachedData(plan: LogicalPlan, cachedRepresentation: InMemoryRelation) { +case class CachedData( + // A normalized resolved plan (See QueryExecution#normalized). + plan: LogicalPlan, + cachedRepresentation: InMemoryRelation) { override def toString: String = s""" |CachedData( @@ -53,7 +56,9 @@ case class CachedData(plan: LogicalPlan, cachedRepresentation: InMemoryRelation) * InMemoryRelation. This relation is automatically substituted query plans that return the * `sameResult` as the originally cached query. * - * Internal to Spark SQL. + * Internal to Spark SQL. All its public APIs take analyzed plans and will normalize them before + * further usage, or take [[Dataset]] and get its normalized plan. 
See `QueryExecution.normalize` + * for more details about plan normalization. */ class CacheManager extends Logging with AdaptiveSparkPlanHelper { @@ -77,41 +82,43 @@ class CacheManager extends Logging with AdaptiveSparkPlanHelper { cachedData.isEmpty } + // Test-only + def cacheQuery(query: Dataset[_]): Unit = { + cacheQuery(query, tableName = None, storageLevel = MEMORY_AND_DISK) + } + /** * Caches the data produced by the logical representation of the given [[Dataset]]. - * Unlike `RDD.cache()`, the default storage level is set to be `MEMORY_AND_DISK` because - * recomputing the in-memory columnar representation of the underlying table is expensive. */ def cacheQuery( query: Dataset[_], - tableName: Option[String] = None, - storageLevel: StorageLevel = MEMORY_AND_DISK): Unit = { - cacheQuery(query.sparkSession, query.queryExecution.normalized, tableName, storageLevel) + tableName: Option[String], + storageLevel: StorageLevel): Unit = { + cacheQueryInternal(query.sparkSession, query.queryExecution.normalized, tableName, storageLevel) } /** - * Caches the data produced by the given [[LogicalPlan]]. - * Unlike `RDD.cache()`, the default storage level is set to be `MEMORY_AND_DISK` because - * recomputing the in-memory columnar representation of the underlying table is expensive. + * Caches the data produced by the given [[LogicalPlan]]. The given plan will be normalized + * before being used further. */ def cacheQuery( spark: SparkSession, planToCache: LogicalPlan, - tableName: Option[String]): Unit = { - cacheQuery(spark, planToCache, tableName, MEMORY_AND_DISK) + tableName: Option[String], + storageLevel: StorageLevel): Unit = { + val normalized = QueryExecution.normalize(spark, planToCache) + cacheQueryInternal(spark, normalized, tableName, storageLevel) } - /** - * Caches the data produced by the given [[LogicalPlan]]. - */ - def cacheQuery( + // The `planToCache` should have been normalized. + private def cacheQueryInternal( spark: SparkSession, planToCache: LogicalPlan, tableName: Option[String], storageLevel: StorageLevel): Unit = { if (storageLevel == StorageLevel.NONE) { // Do nothing for StorageLevel.NONE since it will not actually cache any data. - } else if (lookupCachedData(planToCache).nonEmpty) { + } else if (lookupCachedDataInternal(planToCache).nonEmpty) { logWarning("Asked to cache already cached data.") } else { val sessionWithConfigsOff = getOrCloneSessionWithConfigsOff(spark) @@ -124,7 +131,7 @@ class CacheManager extends Logging with AdaptiveSparkPlanHelper { } this.synchronized { - if (lookupCachedData(planToCache).nonEmpty) { + if (lookupCachedDataInternal(planToCache).nonEmpty) { logWarning("Data has already been cached.") } else { val cd = CachedData(planToCache, inMemoryRelation) @@ -138,38 +145,64 @@ class CacheManager extends Logging with AdaptiveSparkPlanHelper { /** * Un-cache the given plan or all the cache entries that refer to the given plan. - * @param query The [[Dataset]] to be un-cached. - * @param cascade If true, un-cache all the cache entries that refer to the given - * [[Dataset]]; otherwise un-cache the given [[Dataset]] only. + * + * @param query The [[Dataset]] to be un-cached. + * @param cascade If true, un-cache all the cache entries that refer to the given + * [[Dataset]]; otherwise un-cache the given [[Dataset]] only. + * @param blocking Whether to block until all blocks are deleted. 
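   * For illustration only (editorial sketch, not part of this patch; assumes a running `spark`
   * session inside Spark SQL internals):
   * {{{
   *   val analyzed = spark.table("t").queryExecution.analyzed
   *   spark.sharedState.cacheManager.uncacheQuery(spark, analyzed, cascade = true, blocking = false)
   * }}}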
*/ + def uncacheQuery( + query: Dataset[_], + cascade: Boolean, + blocking: Boolean): Unit = { + uncacheQueryInternal(query.sparkSession, query.queryExecution.normalized, cascade, blocking) + } + + // An overload to provide default value for the `blocking` parameter. def uncacheQuery( query: Dataset[_], cascade: Boolean): Unit = { - uncacheQuery(query.sparkSession, query.queryExecution.normalized, cascade) + uncacheQuery(query, cascade, blocking = false) } /** * Un-cache the given plan or all the cache entries that refer to the given plan. - * @param spark The Spark session. - * @param plan The plan to be un-cached. - * @param cascade If true, un-cache all the cache entries that refer to the given - * plan; otherwise un-cache the given plan only. - * @param blocking Whether to block until all blocks are deleted. + * + * @param spark The Spark session. + * @param plan The plan to be un-cached. + * @param cascade If true, un-cache all the cache entries that refer to the given + * plan; otherwise un-cache the given plan only. + * @param blocking Whether to block until all blocks are deleted. */ def uncacheQuery( spark: SparkSession, plan: LogicalPlan, cascade: Boolean, - blocking: Boolean = false): Unit = { - uncacheQuery(spark, _.sameResult(plan), cascade, blocking) + blocking: Boolean): Unit = { + val normalized = QueryExecution.normalize(spark, plan) + uncacheQueryInternal(spark, normalized, cascade, blocking) + } + + // An overload to provide default value for the `blocking` parameter. + def uncacheQuery( + spark: SparkSession, + plan: LogicalPlan, + cascade: Boolean): Unit = { + uncacheQuery(spark, plan, cascade, blocking = false) + } + + // The `plan` should have been normalized. + private def uncacheQueryInternal( + spark: SparkSession, + plan: LogicalPlan, + cascade: Boolean, + blocking: Boolean): Unit = { + uncacheByCondition(spark, _.sameResult(plan), cascade, blocking) } def uncacheTableOrView(spark: SparkSession, name: Seq[String], cascade: Boolean): Unit = { - uncacheQuery( - spark, - isMatchedTableOrView(_, name, spark.sessionState.conf), - cascade, - blocking = false) + uncacheByCondition( + spark, isMatchedTableOrView(_, name, spark.sessionState.conf), cascade, blocking = false) } private def isMatchedTableOrView(plan: LogicalPlan, name: Seq[String], conf: SQLConf): Boolean = { @@ -178,28 +211,24 @@ class CacheManager extends Logging with AdaptiveSparkPlanHelper { } plan match { - case SubqueryAlias(ident, LogicalRelation(_, _, Some(catalogTable), _)) => - val v1Ident = catalogTable.identifier - isSameName(ident.qualifier :+ ident.name) && isSameName(v1Ident.nameParts) + case LogicalRelation(_, _, Some(catalogTable), _) => + isSameName(catalogTable.identifier.nameParts) - case SubqueryAlias(ident, DataSourceV2Relation(_, _, Some(catalog), Some(v2Ident), _)) => + case DataSourceV2Relation(_, _, Some(catalog), Some(v2Ident), _) => import org.apache.spark.sql.connector.catalog.CatalogV2Implicits.IdentifierHelper - isSameName(ident.qualifier :+ ident.name) && - isSameName(v2Ident.toQualifiedNameParts(catalog)) + isSameName(v2Ident.toQualifiedNameParts(catalog)) - case SubqueryAlias(ident, View(catalogTable, _, _)) => - val v1Ident = catalogTable.identifier - isSameName(ident.qualifier :+ ident.name) && isSameName(v1Ident.nameParts) + case View(catalogTable, _, _) => + isSameName(catalogTable.identifier.nameParts) - case SubqueryAlias(ident, HiveTableRelation(catalogTable, _, _, _, _)) => - val v1Ident = catalogTable.identifier - isSameName(ident.qualifier :+ ident.name) && 
isSameName(v1Ident.nameParts) + case HiveTableRelation(catalogTable, _, _, _, _) => + isSameName(catalogTable.identifier.nameParts) case _ => false } } - def uncacheQuery( + private def uncacheByCondition( spark: SparkSession, isMatchedPlan: LogicalPlan => Boolean, cascade: Boolean, @@ -252,10 +281,12 @@ class CacheManager extends Logging with AdaptiveSparkPlanHelper { } /** - * Tries to re-cache all the cache entries that refer to the given plan. + * Tries to re-cache all the cache entries that refer to the given plan. The given plan will be + * normalized before being used further. */ def recacheByPlan(spark: SparkSession, plan: LogicalPlan): Unit = { - recacheByCondition(spark, _.plan.exists(_.sameResult(plan))) + val normalized = QueryExecution.normalize(spark, plan) + recacheByCondition(spark, _.plan.exists(_.sameResult(normalized))) } /** @@ -278,7 +309,7 @@ class CacheManager extends Logging with AdaptiveSparkPlanHelper { } val recomputedPlan = cd.copy(cachedRepresentation = newCache) this.synchronized { - if (lookupCachedData(recomputedPlan.plan).nonEmpty) { + if (lookupCachedDataInternal(recomputedPlan.plan).nonEmpty) { logWarning("While recaching, data was already added to cache.") } else { cachedData = recomputedPlan +: cachedData @@ -289,13 +320,23 @@ class CacheManager extends Logging with AdaptiveSparkPlanHelper { } } - /** Optionally returns cached data for the given [[Dataset]] */ + /** + * Optionally returns cached data for the given [[Dataset]] + */ def lookupCachedData(query: Dataset[_]): Option[CachedData] = { - lookupCachedData(query.queryExecution.normalized) + lookupCachedDataInternal(query.queryExecution.normalized) } - /** Optionally returns cached data for the given [[LogicalPlan]]. */ - def lookupCachedData(plan: LogicalPlan): Option[CachedData] = { + /** + * Optionally returns cached data for the given [[LogicalPlan]]. The given plan will be normalized + * before being used further. + */ + def lookupCachedData(session: SparkSession, plan: LogicalPlan): Option[CachedData] = { + val normalized = QueryExecution.normalize(session, plan) + lookupCachedDataInternal(normalized) + } + + private def lookupCachedDataInternal(plan: LogicalPlan): Option[CachedData] = { val result = cachedData.find(cd => plan.sameResult(cd.plan)) if (result.isDefined) { CacheManager.logCacheOperation(log"Dataframe cache hit for input plan:" + @@ -305,13 +346,16 @@ class CacheManager extends Logging with AdaptiveSparkPlanHelper { result } - /** Replaces segments of the given logical plan with cached versions where possible. */ - def useCachedData(plan: LogicalPlan): LogicalPlan = { + /** + * Replaces segments of the given logical plan with cached versions where possible. The input + * plan must be normalized. + */ + private[sql] def useCachedData(plan: LogicalPlan): LogicalPlan = { val newPlan = plan transformDown { case command: IgnoreCachedData => command case currentFragment => - lookupCachedData(currentFragment).map { cached => + lookupCachedDataInternal(currentFragment).map { cached => // After cache lookup, we should still keep the hints from the input plan. 
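          // Illustrative note (not part of this patch): if the current fragment came from
          //   df.hint("broadcast").join(other, "id")
          // the broadcast hint extracted below is kept so that it still applies on top of the
          // substituted in-memory relation.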
val hints = EliminateResolvedHint.extractHintsFromPlan(currentFragment)._2 val cachedPlan = cached.cachedRepresentation.withOutput(currentFragment.output) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/QueryExecution.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/QueryExecution.scala index d04d8dc2cd7fd..357484ca19df2 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/QueryExecution.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/QueryExecution.scala @@ -134,19 +134,7 @@ class QueryExecution( // The plan that has been normalized by custom rules, so that it's more likely to hit cache. lazy val normalized: LogicalPlan = { - val normalizationRules = sparkSession.sessionState.planNormalizationRules - if (normalizationRules.isEmpty) { - commandExecuted - } else { - val planChangeLogger = new PlanChangeLogger[LogicalPlan]() - val normalized = normalizationRules.foldLeft(commandExecuted) { (p, rule) => - val result = rule.apply(p) - planChangeLogger.logRule(rule.ruleName, p, result) - result - } - planChangeLogger.logBatch("Plan Normalization", commandExecuted, normalized) - normalized - } + QueryExecution.normalize(sparkSession, commandExecuted, Some(tracker)) } lazy val withCachedData: LogicalPlan = sparkSession.withActive { @@ -613,4 +601,27 @@ object QueryExecution { case e: Throwable => throw toInternalError(msg, e) } } + + def normalize( + session: SparkSession, + plan: LogicalPlan, + tracker: Option[QueryPlanningTracker] = None): LogicalPlan = { + val normalizationRules = session.sessionState.planNormalizationRules + if (normalizationRules.isEmpty) { + plan + } else { + val planChangeLogger = new PlanChangeLogger[LogicalPlan]() + val normalized = normalizationRules.foldLeft(plan) { (p, rule) => + val startTime = System.nanoTime() + val result = rule.apply(p) + val runTime = System.nanoTime() - startTime + val effective = !result.fastEquals(p) + tracker.foreach(_.recordRuleInvocation(rule.ruleName, runTime, effective)) + planChangeLogger.logRule(rule.ruleName, p, result) + result + } + planChangeLogger.logBatch("Plan Normalization", plan, normalized) + normalized + } + } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/AnalyzeColumnCommand.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/AnalyzeColumnCommand.scala index 299f41eb55e17..7b0ce3e59263f 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/AnalyzeColumnCommand.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/AnalyzeColumnCommand.scala @@ -61,8 +61,8 @@ case class AnalyzeColumnCommand( private def analyzeColumnInCachedData(plan: LogicalPlan, sparkSession: SparkSession): Boolean = { val cacheManager = sparkSession.sharedState.cacheManager - val planToLookup = sparkSession.sessionState.executePlan(plan).analyzed - cacheManager.lookupCachedData(planToLookup).map { cachedData => + val df = Dataset.ofRows(sparkSession, plan) + cacheManager.lookupCachedData(df).map { cachedData => val columnsToAnalyze = getColumnsToAnalyze( tableIdent, cachedData.cachedRepresentation, columnNames, allColumns) cacheManager.analyzeColumnCacheQuery(sparkSession, cachedData, columnsToAnalyze) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/CommandUtils.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/CommandUtils.scala index d7c5df151bf12..7acd1cb0852b9 100644 --- 
a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/CommandUtils.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/CommandUtils.scala @@ -240,7 +240,7 @@ object CommandUtils extends Logging { // Analyzes a catalog view if the view is cached val table = sparkSession.table(tableIdent.quotedString) val cacheManager = sparkSession.sharedState.cacheManager - if (cacheManager.lookupCachedData(table.logicalPlan).isDefined) { + if (cacheManager.lookupCachedData(table).isDefined) { if (!noScan) { // To collect table stats, materializes an underlying columnar RDD table.count() diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/CacheTableExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/CacheTableExec.scala index fc8a40f885450..56c44a1256815 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/CacheTableExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/CacheTableExec.scala @@ -21,9 +21,9 @@ import java.util.Locale import org.apache.spark.internal.LogKeys.OPTIONS import org.apache.spark.internal.MDC -import org.apache.spark.sql.{DataFrame, Dataset} +import org.apache.spark.sql.Dataset import org.apache.spark.sql.catalyst.{InternalRow, TableIdentifier} -import org.apache.spark.sql.catalyst.analysis.LocalTempView +import org.apache.spark.sql.catalyst.analysis.{LocalTempView, UnresolvedRelation} import org.apache.spark.sql.catalyst.expressions.Attribute import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.sql.catalyst.util.CaseInsensitiveMap @@ -34,7 +34,6 @@ import org.apache.spark.storage.StorageLevel trait BaseCacheTableExec extends LeafV2CommandExec { def relationName: String def planToCache: LogicalPlan - def dataFrameForCachedPlan: DataFrame def isLazy: Boolean def options: Map[String, String] @@ -49,15 +48,12 @@ trait BaseCacheTableExec extends LeafV2CommandExec { logWarning(log"Invalid options: ${MDC(OPTIONS, withoutStorageLevel.mkString(", "))}") } - session.sharedState.cacheManager.cacheQuery( - session, - planToCache, - Some(relationName), - storageLevel) + val df = Dataset.ofRows(session, planToCache) + session.sharedState.cacheManager.cacheQuery(df, Some(relationName), storageLevel) if (!isLazy) { // Performs eager caching. - dataFrameForCachedPlan.count() + df.count() } Seq.empty @@ -74,10 +70,6 @@ case class CacheTableExec( override lazy val relationName: String = multipartIdentifier.quoted override lazy val planToCache: LogicalPlan = relation - - override lazy val dataFrameForCachedPlan: DataFrame = { - Dataset.ofRows(session, planToCache) - } } case class CacheTableAsSelectExec( @@ -89,7 +81,10 @@ case class CacheTableAsSelectExec( referredTempFunctions: Seq[String]) extends BaseCacheTableExec { override lazy val relationName: String = tempViewName - override lazy val planToCache: LogicalPlan = { + override def planToCache: LogicalPlan = UnresolvedRelation(Seq(tempViewName)) + + override def run(): Seq[InternalRow] = { + // CACHE TABLE AS TABLE creates a temp view and caches the temp view. 
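    // Illustrative flow (editorial sketch, not part of this patch): for
    //   CACHE TABLE v AS SELECT * FROM t
    // the CreateViewCommand below first registers the temp view `v`; super.run() then resolves
    // UnresolvedRelation(Seq(v)), caches it, and triggers a count() unless LAZY was specified.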
CreateViewCommand( name = TableIdentifier(tempViewName), userSpecifiedColumns = Nil, @@ -103,12 +98,7 @@ case class CacheTableAsSelectExec( isAnalyzed = true, referredTempFunctions = referredTempFunctions ).run(session) - - dataFrameForCachedPlan.logicalPlan - } - - override lazy val dataFrameForCachedPlan: DataFrame = { - session.table(tempViewName) + super.run() } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2Strategy.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2Strategy.scala index 0d926dcd99c4a..7a668b75c3c73 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2Strategy.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2Strategy.scala @@ -83,7 +83,7 @@ class DataSourceV2Strategy(session: SparkSession) extends Strategy with Predicat // given table, the cache's storage level is returned. private def invalidateTableCache(r: ResolvedTable)(): Option[StorageLevel] = { val v2Relation = DataSourceV2Relation.create(r.table, Some(r.catalog), Some(r.identifier)) - val cache = session.sharedState.cacheManager.lookupCachedData(v2Relation) + val cache = session.sharedState.cacheManager.lookupCachedData(session, v2Relation) session.sharedState.cacheManager.uncacheQuery(session, v2Relation, cascade = true) if (cache.isDefined) { val cacheLevel = cache.get.cachedRepresentation.cacheBuilder.storageLevel diff --git a/sql/core/src/main/scala/org/apache/spark/sql/internal/CatalogImpl.scala b/sql/core/src/main/scala/org/apache/spark/sql/internal/CatalogImpl.scala index df7c4ab1a0c7d..3e20a23a0a066 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/internal/CatalogImpl.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/internal/CatalogImpl.scala @@ -734,9 +734,8 @@ class CatalogImpl(sparkSession: SparkSession) extends Catalog { // same way as how a permanent view is handled. This also avoids a potential issue where a // dependent view becomes invalid because of the above while its data is still cached. 
val viewText = viewDef.desc.viewText - val plan = sparkSession.sessionState.executePlan(viewDef) - sparkSession.sharedState.cacheManager.uncacheQuery( - sparkSession, plan.analyzed, cascade = viewText.isDefined) + val df = Dataset.ofRows(sparkSession, viewDef) + sparkSession.sharedState.cacheManager.uncacheQuery(df, cascade = viewText.isDefined) } catch { case NonFatal(_) => // ignore } diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/ansi/string-functions.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/ansi/string-functions.sql.out index 7ffd3cbd8bac6..c36dec0b105d7 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/ansi/string-functions.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/ansi/string-functions.sql.out @@ -750,6 +750,13 @@ Project [decode(encode(abc, utf-8, false), utf-8) AS decode(encode(abc, utf-8), +- OneRowRelation +-- !query +select decode(encode('大千世界', 'utf-32'), 'utf-32') +-- !query analysis +Project [decode(encode(大千世界, utf-32, false), utf-32) AS decode(encode(大千世界, utf-32), utf-32)#x] ++- OneRowRelation + + -- !query select decode(1, 1, 'Southlake') -- !query analysis diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/string-functions.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/string-functions.sql.out index 7ffd3cbd8bac6..c36dec0b105d7 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/string-functions.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/string-functions.sql.out @@ -750,6 +750,13 @@ Project [decode(encode(abc, utf-8, false), utf-8) AS decode(encode(abc, utf-8), +- OneRowRelation +-- !query +select decode(encode('大千世界', 'utf-32'), 'utf-32') +-- !query analysis +Project [decode(encode(大千世界, utf-32, false), utf-32) AS decode(encode(大千世界, utf-32), utf-32)#x] ++- OneRowRelation + + -- !query select decode(1, 1, 'Southlake') -- !query analysis diff --git a/sql/core/src/test/resources/sql-tests/inputs/string-functions.sql b/sql/core/src/test/resources/sql-tests/inputs/string-functions.sql index 64ea6e655d0b5..733720a7e21b2 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/string-functions.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/string-functions.sql @@ -131,6 +131,7 @@ select encode(scol, ecol) from values('hello', 'Windows-xxx') as t(scol, ecol); select decode(); select decode(encode('abc', 'utf-8')); select decode(encode('abc', 'utf-8'), 'utf-8'); +select decode(encode('大千世界', 'utf-32'), 'utf-32'); select decode(1, 1, 'Southlake'); select decode(2, 1, 'Southlake'); select decode(2, 1, 'Southlake', 2, 'San Francisco', 3, 'New Jersey', 4, 'Seattle', 'Non domestic'); diff --git a/sql/core/src/test/resources/sql-tests/results/ansi/string-functions.sql.out b/sql/core/src/test/resources/sql-tests/results/ansi/string-functions.sql.out index 8096cef266ec4..09d4f8892fa48 100644 --- a/sql/core/src/test/resources/sql-tests/results/ansi/string-functions.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/ansi/string-functions.sql.out @@ -961,6 +961,14 @@ struct abc +-- !query +select decode(encode('大千世界', 'utf-32'), 'utf-32') +-- !query schema +struct +-- !query output +大千世界 + + -- !query select decode(1, 1, 'Southlake') -- !query schema diff --git a/sql/core/src/test/resources/sql-tests/results/string-functions.sql.out b/sql/core/src/test/resources/sql-tests/results/string-functions.sql.out index 91ad830dd3d7a..506524840f107 100644 --- 
a/sql/core/src/test/resources/sql-tests/results/string-functions.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/string-functions.sql.out @@ -893,6 +893,14 @@ struct abc +-- !query +select decode(encode('大千世界', 'utf-32'), 'utf-32') +-- !query schema +struct +-- !query output +大千世界 + + -- !query select decode(1, 1, 'Southlake') -- !query schema diff --git a/sql/core/src/test/scala/org/apache/spark/sql/CachedTableSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/CachedTableSuite.scala index 0ad9ceefc4196..d023fb82185a8 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/CachedTableSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/CachedTableSuite.scala @@ -1107,7 +1107,7 @@ class CachedTableSuite extends QueryTest with SQLTestUtils assert(queryStats1.map(_._1.name).isEmpty) val cacheManager = spark.sharedState.cacheManager - val cachedData = cacheManager.lookupCachedData(query().logicalPlan) + val cachedData = cacheManager.lookupCachedData(query()) assert(cachedData.isDefined) val queryAttrs = cachedData.get.plan.output assert(queryAttrs.size === 3) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/CollationSQLExpressionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/CollationSQLExpressionsSuite.scala index 4314ff97a3cf3..19f34ec15aa07 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/CollationSQLExpressionsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/CollationSQLExpressionsSuite.scala @@ -460,6 +460,204 @@ class CollationSQLExpressionsSuite }) } + test("Support GetJsonObject json expression with collation") { + case class GetJsonObjectTestCase( + input: String, + path: String, + collationName: String, + result: String + ) + + val testCases = Seq( + GetJsonObjectTestCase("{\"a\":\"b\"}", "$.a", "UTF8_BINARY", "b"), + GetJsonObjectTestCase("{\"A\":\"1\"}", "$.A", "UTF8_BINARY_LCASE", "1"), + GetJsonObjectTestCase("{\"x\":true}", "$.x", "UNICODE", "true"), + GetJsonObjectTestCase("{\"X\":1}", "$.X", "UNICODE_CI", "1") + ) + + // Supported collations + testCases.foreach(t => { + val query = + s""" + |SELECT get_json_object('${t.input}', '${t.path}') + |""".stripMargin + // Result & data type + withSQLConf(SqlApiConf.DEFAULT_COLLATION -> t.collationName) { + val testQuery = sql(query) + checkAnswer(testQuery, Row(t.result)) + val dataType = StringType(t.collationName) + assert(testQuery.schema.fields.head.dataType.sameType(dataType)) + } + }) + } + + test("Support JsonTuple json expression with collation") { + case class JsonTupleTestCase( + input: String, + names: String, + collationName: String, + result: Row + ) + + val testCases = Seq( + JsonTupleTestCase("{\"a\":1, \"b\":2}", "'a', 'b'", "UTF8_BINARY", + Row("1", "2")), + JsonTupleTestCase("{\"A\":\"3\", \"B\":\"4\"}", "'A', 'B'", "UTF8_BINARY_LCASE", + Row("3", "4")), + JsonTupleTestCase("{\"x\":true, \"y\":false}", "'x', 'y'", "UNICODE", + Row("true", "false")), + JsonTupleTestCase("{\"X\":null, \"Y\":null}", "'X', 'Y'", "UNICODE_CI", + Row(null, null)) + ) + + // Supported collations + testCases.foreach(t => { + val query = + s""" + |SELECT json_tuple('${t.input}', ${t.names}) + |""".stripMargin + // Result & data type + withSQLConf(SqlApiConf.DEFAULT_COLLATION -> t.collationName) { + val testQuery = sql(query) + checkAnswer(testQuery, t.result) + val dataType = StringType(t.collationName) + assert(testQuery.schema.fields.head.dataType.sameType(dataType)) + } + }) + } + + test("Support JsonToStructs json expression with collation") { + case class 
JsonToStructsTestCase( + input: String, + schema: String, + collationName: String, + result: Row + ) + + val testCases = Seq( + JsonToStructsTestCase("{\"a\":1, \"b\":2.0}", "a INT, b DOUBLE", + "UTF8_BINARY", Row(Row(1, 2.0))), + JsonToStructsTestCase("{\"A\":\"3\", \"B\":4}", "A STRING COLLATE UTF8_BINARY_LCASE, B INT", + "UTF8_BINARY_LCASE", Row(Row("3", 4))), + JsonToStructsTestCase("{\"x\":true, \"y\":null}", "x BOOLEAN, y VOID", + "UNICODE", Row(Row(true, null))), + JsonToStructsTestCase("{\"X\":null, \"Y\":false}", "X VOID, Y BOOLEAN", + "UNICODE_CI", Row(Row(null, false))) + ) + + // Supported collations + testCases.foreach(t => { + val query = + s""" + |SELECT from_json('${t.input}', '${t.schema}') + |""".stripMargin + // Result & data type + withSQLConf(SqlApiConf.DEFAULT_COLLATION -> t.collationName) { + val testQuery = sql(query) + checkAnswer(testQuery, t.result) + val dataType = StructType.fromDDL(t.schema) + assert(testQuery.schema.fields.head.dataType.sameType(dataType)) + } + }) + } + + test("Support StructsToJson json expression with collation") { + case class StructsToJsonTestCase( + struct: String, + collationName: String, + result: Row + ) + + val testCases = Seq( + StructsToJsonTestCase("named_struct('a', 1, 'b', 2)", + "UTF8_BINARY", Row("{\"a\":1,\"b\":2}")), + StructsToJsonTestCase("array(named_struct('a', 1, 'b', 2))", + "UTF8_BINARY_LCASE", Row("[{\"a\":1,\"b\":2}]")), + StructsToJsonTestCase("map('a', named_struct('b', 1))", + "UNICODE", Row("{\"a\":{\"b\":1}}")), + StructsToJsonTestCase("array(map('a', 1))", + "UNICODE_CI", Row("[{\"a\":1}]")) + ) + + // Supported collations + testCases.foreach(t => { + val query = + s""" + |SELECT to_json(${t.struct}) + |""".stripMargin + // Result & data type + withSQLConf(SqlApiConf.DEFAULT_COLLATION -> t.collationName) { + val testQuery = sql(query) + checkAnswer(testQuery, t.result) + val dataType = StringType(t.collationName) + assert(testQuery.schema.fields.head.dataType.sameType(dataType)) + } + }) + } + + test("Support LengthOfJsonArray json expression with collation") { + case class LengthOfJsonArrayTestCase( + input: String, + collationName: String, + result: Row + ) + + val testCases = Seq( + LengthOfJsonArrayTestCase("'[1,2,3,4]'", "UTF8_BINARY", Row(4)), + LengthOfJsonArrayTestCase("'[1,2,3,{\"f1\":1,\"f2\":[5,6]},4]'", "UTF8_BINARY_LCASE", Row(5)), + LengthOfJsonArrayTestCase("'[1,2'", "UNICODE", Row(null)), + LengthOfJsonArrayTestCase("'['", "UNICODE_CI", Row(null)) + ) + + // Supported collations + testCases.foreach(t => { + val query = + s""" + |SELECT json_array_length(${t.input}) + |""".stripMargin + // Result & data type + withSQLConf(SqlApiConf.DEFAULT_COLLATION -> t.collationName) { + val testQuery = sql(query) + checkAnswer(testQuery, t.result) + assert(testQuery.schema.fields.head.dataType.sameType(IntegerType)) + } + }) + } + + test("Support JsonObjectKeys json expression with collation") { + case class JsonObjectKeysJsonArrayTestCase( + input: String, + collationName: String, + result: Row + ) + + val testCases = Seq( + JsonObjectKeysJsonArrayTestCase("{}", "UTF8_BINARY", + Row(Seq())), + JsonObjectKeysJsonArrayTestCase("{\"k\":", "UTF8_BINARY_LCASE", + Row(null)), + JsonObjectKeysJsonArrayTestCase("{\"k1\": \"v1\"}", "UNICODE", + Row(Seq("k1"))), + JsonObjectKeysJsonArrayTestCase("{\"k1\":1,\"k2\":{\"k3\":3, \"k4\":4}}", "UNICODE_CI", + Row(Seq("k1", "k2"))) + ) + + // Supported collations + testCases.foreach(t => { + val query = + s""" + |SELECT json_object_keys('${t.input}') + |""".stripMargin + // 
Result & data type + withSQLConf(SqlApiConf.DEFAULT_COLLATION -> t.collationName) { + val testQuery = sql(query) + checkAnswer(testQuery, t.result) + val dataType = ArrayType(StringType(t.collationName)) + assert(testQuery.schema.fields.head.dataType.sameType(dataType)) + } + }) + } + test("Support StringToMap expression with collation") { // Supported collations case class StringToMapTestCase[R](t: String, p: String, k: String, c: String, result: R) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/test/SQLTestUtils.scala b/sql/core/src/test/scala/org/apache/spark/sql/test/SQLTestUtils.scala index 7da2bb47038ed..5fbf379644f6d 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/test/SQLTestUtils.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/test/SQLTestUtils.scala @@ -339,8 +339,7 @@ private[sql] trait SQLTestUtilsBase val tableIdent = spark.sessionState.sqlParser.parseTableIdentifier(tableName) val cascade = !spark.sessionState.catalog.isTempView(tableIdent) spark.sharedState.cacheManager.uncacheQuery( - spark, - spark.table(tableName).logicalPlan, + spark.table(tableName), cascade = cascade, blocking = true) } diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/CachedTableSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/CachedTableSuite.scala index 89fe10d5c4bd9..d7918f8cbf4f0 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/CachedTableSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/CachedTableSuite.scala @@ -335,9 +335,10 @@ class CachedTableSuite extends QueryTest with SQLTestUtils with TestHiveSingleto options = Map.empty)(sparkSession = spark) val plan = LogicalRelation(relation, tableMeta) - spark.sharedState.cacheManager.cacheQuery(Dataset.ofRows(spark, plan)) + val df = Dataset.ofRows(spark, plan) + spark.sharedState.cacheManager.cacheQuery(df) - assert(spark.sharedState.cacheManager.lookupCachedData(plan).isDefined) + assert(spark.sharedState.cacheManager.lookupCachedData(df).isDefined) val sameCatalog = new CatalogFileIndex(spark, tableMeta, 0) val sameRelation = HadoopFsRelation( @@ -347,9 +348,9 @@ class CachedTableSuite extends QueryTest with SQLTestUtils with TestHiveSingleto bucketSpec = None, fileFormat = new ParquetFileFormat(), options = Map.empty)(sparkSession = spark) - val samePlan = LogicalRelation(sameRelation, tableMeta) + val samePlanDf = Dataset.ofRows(spark, LogicalRelation(sameRelation, tableMeta)) - assert(spark.sharedState.cacheManager.lookupCachedData(samePlan).isDefined) + assert(spark.sharedState.cacheManager.lookupCachedData(samePlanDf).isDefined) } }
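Taken together, the CacheManager hunks above move callers from passing raw logical plans to passing Datasets, or a session plus an analyzed plan that is normalized internally. The following is a minimal caller-side sketch, not part of the patch, that only uses the signatures introduced above; it assumes a running `spark` session and, since CacheManager is internal to Spark SQL, mirrors how the test suites above exercise it:

    import org.apache.spark.storage.StorageLevel

    val cacheManager = spark.sharedState.cacheManager
    val df = spark.range(10).toDF("id")

    // Cache via the Dataset overload; the plan is normalized before it is stored.
    cacheManager.cacheQuery(df, tableName = Some("demo"), storageLevel = StorageLevel.MEMORY_AND_DISK)

    // Both lookup overloads resolve to the same normalized cache entry.
    assert(cacheManager.lookupCachedData(df).isDefined)
    assert(cacheManager.lookupCachedData(spark, df.queryExecution.analyzed).isDefined)

    // Un-cache via the Dataset overload, blocking until the cached blocks are removed.
    cacheManager.uncacheQuery(df, cascade = false, blocking = true)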