Skip to content

Commit

Permalink
[HUDI-4982] Add validation job for spark bundles in GitHub Actions (#…
Browse files Browse the repository at this point in the history
  • Loading branch information
xushiyan authored Oct 19, 2022
1 parent 048299e commit 3c8988c
Show file tree
Hide file tree
Showing 7 changed files with 296 additions and 0 deletions.
9 changes: 9 additions & 0 deletions .github/workflows/bot.yml
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,15 @@ jobs:
FLINK_PROFILE: ${{ matrix.flinkProfile }}
run:
mvn test -Punit-tests -D"$SCALA_PROFILE" -D"$SPARK_PROFILE" -D"$FLINK_PROFILE" -DfailIfNoTests=false -pl hudi-examples/hudi-examples-flink,hudi-examples/hudi-examples-java,hudi-examples/hudi-examples-spark $MVN_ARGS
- name: Bundle Validation
env:
SCALA_PROFILE: ${{ matrix.scalaProfile }}
SPARK_PROFILE: ${{ matrix.sparkProfile }}
FLINK_PROFILE: ${{ matrix.flinkProfile }}
if: ${{ !endsWith(env.SPARK_PROFILE, '2.4') }} # skip test spark 2.4 as it's covered by Azure CI
run: |
HUDI_VERSION=$(mvn help:evaluate -Dexpression=project.version -q -DforceStdout)
./packaging/bundle-validation/spark-write-hive-sync/ci_run.sh $HUDI_VERSION
- name: Spark SQL Test
env:
SCALA_PROFILE: ${{ matrix.scalaProfile }}
Expand Down
56 changes: 56 additions & 0 deletions packaging/bundle-validation/spark-write-hive-sync/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
FROM adoptopenjdk/openjdk8:alpine

# validate.sh is a bash script; busybox sh in alpine is not sufficient
RUN apk add --no-cache --upgrade bash

# WORKDIR creates the directory if it does not exist, so no explicit mkdir is needed.
# The env var is kept so scripts running inside the container can reference it.
ENV WORKDIR=/opt/hudi-bundles
WORKDIR $WORKDIR

# Default component versions; ci_run.sh overrides these via --build-arg
# to match the maven build profile being validated.
ARG HADOOP_VERSION=2.7.7
ARG HIVE_VERSION=3.1.3
ARG DERBY_VERSION=10.14.1.0
ARG SPARK_VERSION=3.1.3
ARG SPARK_HADOOP_VERSION=2.7

# Each download removes its tarball in the same layer to keep the image small.
RUN wget https://archive.apache.org/dist/hadoop/common/hadoop-$HADOOP_VERSION/hadoop-$HADOOP_VERSION.tar.gz -P "$WORKDIR" \
 && tar -xf $WORKDIR/hadoop-$HADOOP_VERSION.tar.gz -C $WORKDIR/ \
 && rm $WORKDIR/hadoop-$HADOOP_VERSION.tar.gz
ENV HADOOP_HOME=$WORKDIR/hadoop-$HADOOP_VERSION

RUN wget https://archive.apache.org/dist/hive/hive-$HIVE_VERSION/apache-hive-$HIVE_VERSION-bin.tar.gz -P "$WORKDIR" \
 && tar -xf $WORKDIR/apache-hive-$HIVE_VERSION-bin.tar.gz -C $WORKDIR/ \
 && rm $WORKDIR/apache-hive-$HIVE_VERSION-bin.tar.gz
ENV HIVE_HOME=$WORKDIR/apache-hive-$HIVE_VERSION-bin

# Derby provides the network database backing the Hive metastore
RUN wget https://archive.apache.org/dist/db/derby/db-derby-$DERBY_VERSION/db-derby-$DERBY_VERSION-bin.tar.gz -P "$WORKDIR" \
 && tar -xf $WORKDIR/db-derby-$DERBY_VERSION-bin.tar.gz -C $WORKDIR/ \
 && rm $WORKDIR/db-derby-$DERBY_VERSION-bin.tar.gz
ENV DERBY_HOME=$WORKDIR/db-derby-$DERBY_VERSION-bin

RUN wget https://archive.apache.org/dist/spark/spark-$SPARK_VERSION/spark-$SPARK_VERSION-bin-hadoop$SPARK_HADOOP_VERSION.tgz -P "$WORKDIR" \
 && tar -xf $WORKDIR/spark-$SPARK_VERSION-bin-hadoop$SPARK_HADOOP_VERSION.tgz -C $WORKDIR/ \
 && rm $WORKDIR/spark-$SPARK_VERSION-bin-hadoop$SPARK_HADOOP_VERSION.tgz
ENV SPARK_HOME=$WORKDIR/spark-$SPARK_VERSION-bin-hadoop$SPARK_HADOOP_VERSION

# Spark needs the Derby JDBC client driver on its classpath to reach the metastore DB
RUN cp $DERBY_HOME/lib/derbyclient.jar $SPARK_HOME/jars/
# Share one hive-site.xml between Hive and Spark via a symlink
COPY hive-site.xml $HIVE_HOME/conf/
RUN ln -sf $HIVE_HOME/conf/hive-site.xml $SPARK_HOME/conf/hive-site.xml
COPY spark-defaults.conf $SPARK_HOME/conf/
COPY validate.scala .
COPY validate.sh .
71 changes: 71 additions & 0 deletions packaging/bundle-validation/spark-write-hive-sync/ci_run.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
#!/bin/bash

# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

# Note:
# this script is to run by GitHub Actions CI tasks from the project root directory
# and contains environment-specific variables
#
# Usage: ci_run.sh <hudi-version>
# Requires $GITHUB_WORKSPACE and $SPARK_PROFILE to be set in the environment.

HUDI_VERSION=$1

# Stage the freshly built spark bundle jars in a directory that is later
# mounted into the validation container.
# -p makes the step idempotent if the directory already exists.
mkdir -p "${GITHUB_WORKSPACE}/jars"
cp packaging/hudi-spark-bundle/target/hudi-*-"$HUDI_VERSION".jar "${GITHUB_WORKSPACE}/jars"
echo 'Validating jars below:'
ls -l "${GITHUB_WORKSPACE}/jars"

# choose runtime component versions based on the maven build profile
if [[ ${SPARK_PROFILE} == 'spark2.4' ]]; then
  HADOOP_VERSION=2.7.7
  HIVE_VERSION=2.3.9
  DERBY_VERSION=10.10.2.0
  SPARK_VERSION=2.4.8
  SPARK_HADOOP_VERSION=2.7
  IMAGE_TAG=spark248hive239
elif [[ ${SPARK_PROFILE} == 'spark3.1' ]]; then
  HADOOP_VERSION=2.7.7
  HIVE_VERSION=3.1.3
  DERBY_VERSION=10.14.1.0
  SPARK_VERSION=3.1.3
  SPARK_HADOOP_VERSION=2.7
  IMAGE_TAG=spark313hive313
elif [[ ${SPARK_PROFILE} == 'spark3.2' ]]; then
  HADOOP_VERSION=2.7.7
  HIVE_VERSION=3.1.3
  DERBY_VERSION=10.14.1.0
  SPARK_VERSION=3.2.2
  SPARK_HADOOP_VERSION=2.7
  IMAGE_TAG=spark322hive313
elif [[ ${SPARK_PROFILE} == 'spark3.3' ]]; then
  HADOOP_VERSION=2.7.7
  HIVE_VERSION=3.1.3
  DERBY_VERSION=10.14.1.0
  SPARK_VERSION=3.3.0
  SPARK_HADOOP_VERSION=2
  IMAGE_TAG=spark330hive313
else
  # Without this guard an unrecognized profile would fall through and run
  # docker build with all version variables empty.
  echo "Unsupported SPARK_PROFILE: '${SPARK_PROFILE}'" >&2
  exit 1
fi

cd packaging/bundle-validation/spark-write-hive-sync || exit 1
docker build \
  --build-arg HADOOP_VERSION="$HADOOP_VERSION" \
  --build-arg HIVE_VERSION="$HIVE_VERSION" \
  --build-arg DERBY_VERSION="$DERBY_VERSION" \
  --build-arg SPARK_VERSION="$SPARK_VERSION" \
  --build-arg SPARK_HADOOP_VERSION="$SPARK_HADOOP_VERSION" \
  -t hudi-ci-bundle-validation:"$IMAGE_TAG" \
  .
# Mount the staged bundle jars where validate.sh expects them; the container's
# exit code (from validate.sh) becomes this job step's result.
docker run -v "${GITHUB_WORKSPACE}/jars":/opt/hudi-bundles/jars -i hudi-ci-bundle-validation:"$IMAGE_TAG" bash validate.sh
53 changes: 53 additions & 0 deletions packaging/bundle-validation/spark-write-hive-sync/hive-site.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<configuration>
<property>
<name>system:user.name</name>
<value>${user.name}</value>
</property>
<property>
<name>system:java.io.tmpdir</name>
<value>file:///tmp/hudi-bundles/hive/java</value>
</property>
<property>
<name>hive.exec.scratchdir</name>
<value>file:///tmp/hudi-bundles/hive/exec</value>
</property>
<property>
<name>hive.metastore.warehouse.dir</name>
<value>file:///tmp/hudi-bundles/hive/warehouse</value>
</property>
<property>
<name>hive.metastore.schema.verification</name>
<value>false</value>
</property>
<!-- TODO: use autoCreateAll = false for hive 2.x -->
<property>
<name>datanucleus.schema.autoCreateAll</name>
<value>true</value>
</property>
<property>
<name>javax.jdo.option.ConnectionDriverName</name>
<value>org.apache.derby.jdbc.ClientDriver</value>
</property>
<property>
<name>javax.jdo.option.ConnectionURL</name>
<value>jdbc:derby://localhost:1527/default;create=true</value>
</property>
</configuration>
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

spark.serializer org.apache.spark.serializer.KryoSerializer
spark.sql.extensions org.apache.spark.sql.hudi.HoodieSparkSessionExtension
spark.sql.warehouse.dir file:///tmp/hudi-bundles/hive/warehouse
57 changes: 57 additions & 0 deletions packaging/bundle-validation/spark-write-hive-sync/validate.scala
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

import org.apache.hudi.QuickstartUtils._
import scala.collection.JavaConversions._
import org.apache.spark.sql.SaveMode._
import org.apache.hudi.DataSourceReadOptions._
import org.apache.hudi.DataSourceWriteOptions._
import org.apache.hudi.config.HoodieWriteConfig._
import org.apache.hudi.common.model.HoodieRecord

// Sanity check for the bundle jars: write a small Hudi table with hive sync
// enabled (HMS mode), then read it back through the synced Hive table and
// verify the record count. Exits non-zero on mismatch so CI fails the step.
val expected = 10
val database = "default"
val tableName = "trips"
val basePath = "file:///tmp/hudi-bundles/tests/" + tableName
val dataGen = new DataGenerator
val inserts = convertToStringList(dataGen.generateInserts(expected))
val df = spark.read.json(spark.sparkContext.parallelize(inserts, 2))
df.write.format("hudi").
  options(getQuickstartWriteConfigs).
  option(PRECOMBINE_FIELD_OPT_KEY, "ts").
  option(RECORDKEY_FIELD_OPT_KEY, "uuid").
  option(PARTITIONPATH_FIELD_OPT_KEY, "partitionpath").
  option(TABLE_NAME, tableName).
  option("hoodie.datasource.meta.sync.enable", "true").
  option("hoodie.datasource.hive_sync.database", database).
  option("hoodie.datasource.hive_sync.table", tableName).
  option("hoodie.datasource.hive_sync.partition_extractor_class", "org.apache.hudi.hive.SinglePartPartitionValueExtractor").
  option("hoodie.datasource.hive_sync.mode", "hms").
  option("hoodie.datasource.hive_sync.metastore.uris", "thrift://localhost:9083/").
  mode(Overwrite).
  save(basePath)

spark.sql("desc " + tableName).show
val actual = spark.sql("select * from " + tableName).count
if (expected == actual) {
  // NOTE: the interpolator must be `s"..."` — the original `$"..."` builds a
  // Spark ColumnName (via spark.implicits) and never interpolates the values.
  System.out.println("bundle combination passed sanity run.")
  System.exit(0)
} else {
  System.err.println(s"bundle combination failed sanity run:\n\tshould have written $expected records in $database.$tableName")
  System.exit(1)
}
30 changes: 30 additions & 0 deletions packaging/bundle-validation/spark-write-hive-sync/validate.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
#!/bin/bash

# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

# NOTE: this script runs inside the hudi-ci-bundle-validation container.
# A host directory holding the bundle jars is expected to be mounted at
# $WORKDIR/jars.
# TODO: the jar list should be tried in different orders to surface
# class-loading issues between bundles.

# Start the Derby network server (metastore backing DB) and HiveServer2 in
# the background; the spark-shell run below relies on them for hive sync.
"$DERBY_HOME"/bin/startNetworkServer -h 0.0.0.0 &
"$HIVE_HOME"/bin/hiveserver2 &

WORKDIR=/opt/hudi-bundles
# Join all mounted bundle jars into a comma-separated --jars argument.
jar_list=$(echo "$WORKDIR"/jars/*.jar | tr ' ' ',')
"$SPARK_HOME"/bin/spark-shell --jars "$jar_list" < "$WORKDIR"/validate.scala

# Propagate spark-shell's exit status (validate.scala calls System.exit).
exit $?

0 comments on commit 3c8988c

Please sign in to comment.