[SPARK-40654][SQL] Protobuf support for Spark - from_protobuf AND to_protobuf #37972

Closed · wants to merge 47 commits

Changes from all commits (47 commits)
419dd78
spark support for protobuf from_proto
Aug 11, 2022
870dbbf
read .pb fileformat
Aug 18, 2022
077ac9e
read .pb fileformat
Aug 18, 2022
d605bc5
to_proto version
Aug 21, 2022
1ea816a
Working copy of from_proto and to_proto
Aug 27, 2022
ac94e14
Merge remote-tracking branch 'apache-github/master' into SPARK_FROM_P…
Aug 27, 2022
d2710c6
Initial from_proto and to_proto support
Aug 28, 2022
c7fca1c
Abstracted the array operations into newArrayWriter
mposdev21 Aug 30, 2022
4c9bd74
Merge pull request #1 from SandishKumarHN/redo-array
SandishKumarHN Aug 30, 2022
7d60f9e
Added test cases.
mposdev21 Aug 30, 2022
0435dd7
Merge commit 'refs/pull/2/head' of https://github.com/SandishKumarHN/…
Aug 31, 2022
85884f7
Merge pull request #2 from SandishKumarHN/redo-array
SandishKumarHN Aug 31, 2022
0e0a4d5
Merge branch 'SPARK_PROTO_1' of https://github.com/SandishKumarHN/spa…
Aug 31, 2022
5d57331
remove use of proto compiler, ProtoSerdeSuite, ProtoFunctionsSuite tests
Sep 3, 2022
c8ac0d4
Added new test cases for the following:
mposdev21 Sep 3, 2022
6aeb274
Merge branch 'SPARK_PROTO_1' of github.com:SandishKumarHN/spark into …
mposdev21 Sep 3, 2022
63d648d
Fixed formatting problems and removed unused argument (ProtoDeseriali…
mposdev21 Sep 4, 2022
d24f873
Filter push down works now after making sure the sql schema
mposdev21 Sep 4, 2022
9185cc8
Don't handle exceptions locally. Spark tests need to see them.
mposdev21 Sep 7, 2022
8a2f493
serde suite sql type
Sep 7, 2022
75a4e5f
Remove the failing test (multiple_messages.desc not found)
mposdev21 Sep 7, 2022
c86c5d8
Added a test for bad schema. DynamicMessage parseFrom parses
mposdev21 Sep 15, 2022
be82d92
fixing scala style issues
Sep 15, 2022
849213e
Fix the typo in the test case
mposdev21 Sep 19, 2022
adbfaf1
implementing to_proto without descriptor funtion
SandishKumarHN Sep 20, 2022
a418407
Merge branch 'SPARK_PROTO_1' of https://github.com/SandishKumarHN/spa…
SandishKumarHN Sep 20, 2022
f7ca9e7
fixing style check issues
SandishKumarHN Sep 20, 2022
a65c131
adding PROTO_REBASE_MODE_IN_READ support
SandishKumarHN Sep 20, 2022
e0c22d2
CleanMessage to BasicMessage SerdeSuite
SandishKumarHN Sep 21, 2022
780d119
some nit changes
SandishKumarHN Sep 22, 2022
e22b763
nit picks
SandishKumarHN Sep 22, 2022
e1fde67
Merge branch 'master' of https://github.com/apache/spark into SPARK_P…
SandishKumarHN Sep 22, 2022
1901cd4
including proto to SparkBuild
SandishKumarHN Sep 22, 2022
121630c
scala 13 compilation changes
SandishKumarHN Sep 22, 2022
e8a7c2b
fixing build issues
SandishKumarHN Sep 23, 2022
6cfad63
Added a new test case for from_proto->filter->to_proto
mposdev21 Sep 25, 2022
4b824bd
Merge remote-tracking branch 'remote-spark/master' into SPARK_PROTO_1
SandishKumarHN Sep 25, 2022
2d2a822
spark-proto: shading google protobuf jars
SandishKumarHN Sep 28, 2022
7cdf9dd
mapsupport, rename proto-protobuf, schema-evolution, catch-recursive …
SandishKumarHN Sep 29, 2022
a90800c
Nit, cleanup unused
SandishKumarHN Oct 3, 2022
6f447e8
stylecheck, remove positional match, handle null values
SandishKumarHN Oct 5, 2022
7aa678c
Merge remote-tracking branch 'remote-spark/master' into SPARK_PROTO_1
SandishKumarHN Oct 6, 2022
c6d45b7
required field, remove tukaani, github labeler
SandishKumarHN Oct 6, 2022
58151dc
google protobuf shading changes
SandishKumarHN Oct 6, 2022
03d791c
Timestamp support and use latest error msg framework
SandishKumarHN Oct 11, 2022
d7c40ac
DayTimeIntervalType and Duration support
SandishKumarHN Oct 11, 2022
330d01e
move arraywriter mapping to single line
SandishKumarHN Oct 11, 2022
3 changes: 3 additions & 0 deletions .github/labeler.yml
@@ -155,3 +155,6 @@ CONNECT:
- "connector/connect/**/*"
- "**/sql/sparkconnect/**/*"
- "python/pyspark/sql/**/connect/**/*"
PROTOBUF:
- "connector/protobuf/**/*"
- "python/pyspark/sql/protobuf/**/*"
115 changes: 115 additions & 0 deletions connector/protobuf/pom.xml
@@ -0,0 +1,115 @@
<?xml version="1.0" encoding="UTF-8"?>
Contributor:
Is there a Jira open for this feature? We can open an EPIC. I am thinking of adding tasks under it.
cc: @mposdev21

Contributor:
No, there is no JIRA. Please go ahead and create the EPIC.

Contributor:
Created an Epic with 5 tasks under it: https://issues.apache.org/jira/browse/SPARK-40653
I see that the PR title is already updated :). Nice.

<!--
~ Licensed to the Apache Software Foundation (ASF) under one or more
Contributor:
I am planning to work on Python support. @mposdev21 @SandishKumarHN, let me know if you already have changes for it; otherwise, I can get started.

Contributor (Author):
@rangadi we already have the changes ready. Should we create a separate PR now?

Contributor:
Yes, please go ahead with the PR if you can. We can start reviewing it even as this PR is being merged.

~ contributor license agreements. See the NOTICE file distributed with
Contributor:
What about schema-registry support? I can look into that. Let me know, @mposdev21 @SandishKumarHN.

Contributor:
Sure, Raghu. Go ahead.

Contributor:
Sure, I will take that.

~ this work for additional information regarding copyright ownership.
~ The ASF licenses this file to You under the Apache License, Version 2.0
~ (the "License"); you may not use this file except in compliance with
~ the License. You may obtain a copy of the License at
~
~ http://www.apache.org/licenses/LICENSE-2.0
~
~ Unless required by applicable law or agreed to in writing, software
~ distributed under the License is distributed on an "AS IS" BASIS,
~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
~ See the License for the specific language governing permissions and
~ limitations under the License.
-->

<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<parent>
<groupId>org.apache.spark</groupId>
<artifactId>spark-parent_2.12</artifactId>
<version>3.4.0-SNAPSHOT</version>
<relativePath>../../pom.xml</relativePath>
</parent>

<artifactId>spark-protobuf_2.12</artifactId>
<properties>
<sbt.project.name>protobuf</sbt.project.name>
<protobuf.version>3.21.1</protobuf.version>
</properties>
<packaging>jar</packaging>
<name>Spark Protobuf</name>
<url>https://spark.apache.org/</url>

<dependencies>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-sql_${scala.binary.version}</artifactId>
<version>${project.version}</version>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-core_${scala.binary.version}</artifactId>
<version>${project.version}</version>
<type>test-jar</type>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-catalyst_${scala.binary.version}</artifactId>
<version>${project.version}</version>
<type>test-jar</type>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-sql_${scala.binary.version}</artifactId>
<version>${project.version}</version>
<type>test-jar</type>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.scalacheck</groupId>
<artifactId>scalacheck_${scala.binary.version}</artifactId>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-tags_${scala.binary.version}</artifactId>
</dependency>
<!-- #if scala-2.13 --><!--
<dependency>
<groupId>org.scala-lang.modules</groupId>
<artifactId>scala-parallel-collections_${scala.binary.version}</artifactId>
</dependency>
--><!-- #endif scala-2.13 -->
<dependency>
<groupId>com.google.protobuf</groupId>
<artifactId>protobuf-java</artifactId>
<version>${protobuf.version}</version>
<scope>compile</scope>
</dependency>
Comment on lines +80 to +85

Contributor:
Can we shade this?

Contributor (Author):
@holdenk I have already added shading for "com.google.protobuf" below in the plugins section. Am I missing something?

rangadi (Contributor), Oct 7, 2022:
Should the import statements be using the shaded packages? By the way, what is the reason for shading? What blocks us from using Spark's version of Protobuf?

Contributor (Author):
@rangadi shading and relocation are needed to avoid potential conflicts with other third-party libraries, such as Hadoop (which uses proto2+); more discussion here: #37075 (comment). We don't need to change imports to the shaded ones; I have tested the same on the cluster. Without shading I was getting the error below.

Exception in thread "main" java.lang.NoSuchMethodError: 'boolean com.google.protobuf.DescriptorProtos$MessageOptions.getMapEntry()'
        at org.apache.spark.sql.protobuf.utils.SchemaConverters$.structFieldFor(SchemaConverters.scala:65)

The spark/connector/connect module also does shading & relocation of google-protobuf; I'm following the same approach here.

Contributor:
Hmm, I wonder how it works without importing the shaded library. When we shade, it is like a new package. If we don't import it, then we are using the default one.

Yeah, MapEntry does not exist in the v2 library, I guess.

SandishKumarHN (Contributor, Author), Oct 7, 2022:
@rangadi after packaging, if a Spark app imports the "spark-protobuf" jar, it won't find a "spark-protobuf"-specific copy of "com.google.protobuf:"; instead the jar will contain the relocated one, "org.sparkproject.spark-protobuf.protobuf:", and nothing other than the "spark-protobuf" module will be using that Protobuf copy. Shading is just making an uber jar with the relocated classes. The shaded classes will be available for import only after the shaded jar has been added as a dependency in pom.xml, correct?

Found this online:

If the uber JAR is reused as a dependency of some other project, directly including classes
from the artifact's dependencies in the uber JAR can cause class loading conflicts due to duplicate
classes on the class path. To address this issue, one can relocate the classes which get included
in the shaded artifact in order to create a private copy of their bytecode.

Contributor:
Ok, let's go with this. Not exactly clear what is happening. Spark experts might comment as required.


</dependencies>
<build>
<outputDirectory>target/scala-${scala.binary.version}/classes</outputDirectory>
<testOutputDirectory>target/scala-${scala.binary.version}/test-classes</testOutputDirectory>
<plugins>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-shade-plugin</artifactId>
<configuration>
<shadedArtifactAttached>false</shadedArtifactAttached>
<artifactSet>
<includes>
<include>com.google.protobuf:*</include>
</includes>
</artifactSet>
<relocations>
<relocation>
<pattern>com.google.protobuf</pattern>
<shadedPattern>${spark.shade.packageName}.spark-protobuf.protobuf</shadedPattern>
<includes>
<include>com.google.protobuf.**</include>
</includes>
</relocation>
</relocations>
</configuration>
</plugin>
</plugins>
</build>
</project>
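
The shading thread above comes down to which copy of protobuf-java ends up on the application classpath. A minimal debugging sketch, not part of this PR and only illustrative, for confirming where a running Spark application loaded its Protobuf classes from:

// Hedged illustration only. If an older protobuf-java 2.x (e.g. pulled in via Hadoop)
// wins on the classpath, 3.x-only methods such as
// DescriptorProtos.MessageOptions#getMapEntry are missing and fail with the
// NoSuchMethodError quoted in the review thread above. Printing the code source
// shows which jar the class was actually loaded from.
val codeSource = classOf[com.google.protobuf.DescriptorProtos.MessageOptions]
  .getProtectionDomain
  .getCodeSource
println(s"protobuf-java loaded from: ${Option(codeSource).map(_.getLocation)}")

With the relocation configured in the shade plugin above, the copy bundled inside spark-protobuf lives under a different package name, so it cannot collide with whatever version the application or Hadoop brings.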
@@ -0,0 +1,54 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.spark.sql.protobuf

import com.google.protobuf.DynamicMessage

import org.apache.spark.sql.catalyst.expressions.{Expression, UnaryExpression}
import org.apache.spark.sql.catalyst.expressions.codegen.{CodegenContext, ExprCode}
import org.apache.spark.sql.protobuf.utils.ProtobufUtils
import org.apache.spark.sql.types.{BinaryType, DataType}

private[protobuf] case class CatalystDataToProtobuf(
child: Expression,
descFilePath: String,
messageName: String)
extends UnaryExpression {

override def dataType: DataType = BinaryType

@transient private lazy val protoType =
ProtobufUtils.buildDescriptor(descFilePath, messageName)

@transient private lazy val serializer =
new ProtobufSerializer(child.dataType, protoType, child.nullable)

override def nullSafeEval(input: Any): Any = {
val dynamicMessage = serializer.serialize(input).asInstanceOf[DynamicMessage]
dynamicMessage.toByteArray
}

override def prettyName: String = "to_protobuf"

override protected def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = {
val expr = ctx.addReferenceObj("this", this)
defineCodeGen(ctx, ev, input => s"(byte[]) $expr.nullSafeEval($input)")
}

override protected def withNewChildInternal(newChild: Expression): CatalystDataToProtobuf =
copy(child = newChild)
}
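
For orientation, a hedged sketch of how this expression could be exercised directly. The descriptor path and message name are made up for illustration, and because the case class is private[protobuf] the snippet would have to live in the org.apache.spark.sql.protobuf package; user code would instead go through the from_protobuf/to_protobuf functions added elsewhere in this PR, whose argument order may differ.

import org.apache.spark.sql.{Column, SparkSession}
import org.apache.spark.sql.functions.struct

// Assumes an active SparkSession named `spark`; path and message name are hypothetical.
val events = spark.range(5).selectExpr("id", "concat('name_', id) AS name")
val payload = new Column(
  CatalystDataToProtobuf(
    struct(events("id"), events("name")).expr, // Catalyst struct to serialize
    "/tmp/descriptors/events.desc",            // assumed descriptor set file (protoc --descriptor_set_out)
    "Event"))                                  // assumed message name inside the descriptor
events.select(payload.as("proto_bytes")).printSchema() // proto_bytes: binary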
@@ -0,0 +1,160 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.spark.sql.protobuf

import scala.collection.JavaConverters._
import scala.util.control.NonFatal

import com.google.protobuf.DynamicMessage

import org.apache.spark.SparkException
import org.apache.spark.sql.AnalysisException
import org.apache.spark.sql.catalyst.expressions.{ExpectsInputTypes, Expression, SpecificInternalRow, UnaryExpression}
import org.apache.spark.sql.catalyst.expressions.codegen.{CodegenContext, CodeGenerator, ExprCode}
import org.apache.spark.sql.catalyst.util.{FailFastMode, ParseMode, PermissiveMode}
import org.apache.spark.sql.protobuf.utils.{ProtobufOptions, ProtobufUtils, SchemaConverters}
import org.apache.spark.sql.types.{AbstractDataType, BinaryType, DataType, StructType}

private[protobuf] case class ProtobufDataToCatalyst(
child: Expression,
descFilePath: String,
messageName: String,
options: Map[String, String])
extends UnaryExpression
with ExpectsInputTypes {

override def inputTypes: Seq[AbstractDataType] = Seq(BinaryType)

override lazy val dataType: DataType = {
val dt = SchemaConverters.toSqlType(messageDescriptor).dataType
parseMode match {
// With PermissiveMode, the output Catalyst row might contain columns of null values for
// corrupt records, even if some of the columns are not nullable in the user-provided schema.
// Therefore we force the schema to be all nullable here.
case PermissiveMode => dt.asNullable
case _ => dt
}
}

override def nullable: Boolean = true

private lazy val protobufOptions = ProtobufOptions(options)

@transient private lazy val messageDescriptor =
ProtobufUtils.buildDescriptor(descFilePath, messageName)

@transient private lazy val fieldsNumbers =
messageDescriptor.getFields.asScala.map(f => f.getNumber)

@transient private lazy val deserializer = new ProtobufDeserializer(messageDescriptor, dataType)

@transient private var result: DynamicMessage = _

@transient private lazy val parseMode: ParseMode = {
val mode = protobufOptions.parseMode
if (mode != PermissiveMode && mode != FailFastMode) {
throw new AnalysisException(unacceptableModeMessage(mode.name))
}
mode
}

private def unacceptableModeMessage(name: String): String = {
s"from_protobuf() doesn't support the $name mode. " +
s"Acceptable modes are ${PermissiveMode.name} and ${FailFastMode.name}."
}

@transient private lazy val nullResultRow: Any = dataType match {
case st: StructType =>
val resultRow = new SpecificInternalRow(st.map(_.dataType))
for (i <- 0 until st.length) {
resultRow.setNullAt(i)
}
resultRow

case _ =>
null
}

private def handleException(e: Throwable): Any = {
parseMode match {
case PermissiveMode =>
nullResultRow
case FailFastMode =>
throw new SparkException(
"Malformed records are detected in record parsing. " +
s"Current parse Mode: ${FailFastMode.name}. To process malformed records as null " +
"result, try setting the option 'mode' as 'PERMISSIVE'.",
e)
case _ =>
throw new AnalysisException(unacceptableModeMessage(parseMode.name))
}
}

override def nullSafeEval(input: Any): Any = {
val binary = input.asInstanceOf[Array[Byte]]
try {
result = DynamicMessage.parseFrom(messageDescriptor, binary)
val unknownFields = result.getUnknownFields
if (!unknownFields.asMap().isEmpty) {
unknownFields.asMap().keySet().asScala.map { number =>
{
if (fieldsNumbers.contains(number)) {
return handleException(
new Throwable(s"Type mismatch encountered for field:" +
s" ${messageDescriptor.getFields.get(number)}"))
}
}
}
}
val deserialized = deserializer.deserialize(result)
assert(
deserialized.isDefined,
"Protobuf deserializer cannot return an empty result because filters are not pushed down")
deserialized.get
} catch {
// There could be multiple possible exceptions here, e.g. java.io.IOException,
// ProtoRuntimeException, ArrayIndexOutOfBoundsException, etc.
// To make it simple, catch all the exceptions here.
case NonFatal(e) =>
handleException(e)
}
}

override def prettyName: String = "from_protobuf"

override protected def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = {
val expr = ctx.addReferenceObj("this", this)
nullSafeCodeGen(
ctx,
ev,
eval => {
val result = ctx.freshName("result")
val dt = CodeGenerator.boxedType(dataType)
s"""
$dt $result = ($dt) $expr.nullSafeEval($eval);
if ($result == null) {
${ev.isNull} = true;
} else {
${ev.value} = $result;
}
"""
})
}

override protected def withNewChildInternal(newChild: Expression): ProtobufDataToCatalyst =
copy(child = newChild)
}
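
And a matching hedged sketch for the deserialization direction, again from inside the org.apache.spark.sql.protobuf package with a made-up descriptor path and message name. The "mode" option key mirrors the Avro reader and is an assumption about ProtobufOptions here.

import org.apache.spark.sql.{Column, DataFrame}

// Parse a binary Protobuf column back into a Catalyst struct.
// With PERMISSIVE mode (assumed option key "mode"), malformed records come back as
// an all-null row instead of failing the query, matching handleException above.
def decodeEvents(df: DataFrame, binaryCol: String): DataFrame = {
  val parsed = new Column(
    ProtobufDataToCatalyst(
      df(binaryCol).expr,
      "/tmp/descriptors/events.desc", // assumed descriptor set file
      "Event",                        // assumed message name
      Map("mode" -> "PERMISSIVE")))
  df.select(parsed.as("event"))
}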