diff --git a/.github/workflows/ci-tests.yml b/.github/workflows/ci-tests.yml index 96018b1fa..a3597bce7 100644 --- a/.github/workflows/ci-tests.yml +++ b/.github/workflows/ci-tests.yml @@ -50,7 +50,6 @@ jobs: run: | ${{github.workspace}}/dev/ci/ci-local-test.sh yarn-test: - if: ${{ false }} # disable for now name: Yarn Test for Examples (CPU) runs-on: ubuntu-20.04 steps: diff --git a/dev/ci/ci-yarn-test.sh b/dev/ci/ci-yarn-test.sh index e9a93aee3..8b1a2f5c9 100755 --- a/dev/ci/ci-yarn-test.sh +++ b/dev/ci/ci-yarn-test.sh @@ -34,7 +34,7 @@ echo "=========================================" echo "Cluster Testing with Spark Version: $SPARK_VERSION" echo "=========================================" -# Build and run all examples +# Build and run all examplesdebug#./build-all-scala.sh ./build-all-scala.sh ./run-all-scala.sh -./run-all-pyspark.sh +#./run-all-pyspark.sh diff --git a/dev/install-build-deps-ubuntu.sh b/dev/install-build-deps-ubuntu.sh index 8ae9a4e2c..e4d417418 100755 --- a/dev/install-build-deps-ubuntu.sh +++ b/dev/install-build-deps-ubuntu.sh @@ -8,6 +8,7 @@ if [ ! -d /opt/intel/oneapi ]; then | gpg --dearmor | sudo tee /usr/share/keyrings/oneapi-archive-keyring.gpg > /dev/null echo "deb [signed-by=/usr/share/keyrings/oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main" | sudo tee /etc/apt/sources.list.d/oneAPI.list sudo apt update + sudo -E apt-cache pkgnames intel | grep intel-oneapi-runtime sudo apt-get install -y intel-oneapi-ccl-devel-2021.8.0 \ intel-oneapi-tbb-common-devel-2021.8.0 intel-oneapi-tbb-devel-2021.8.0 \ intel-oneapi-mpi-devel-2021.8.0 \ diff --git a/dev/test-cluster/yarn/env.sh b/dev/test-cluster/yarn/env.sh index 3430d4c3e..94d468bc7 100755 --- a/dev/test-cluster/yarn/env.sh +++ b/dev/test-cluster/yarn/env.sh @@ -38,10 +38,10 @@ else fi # Set Spark resources, can be overwritten in example -SPARK_DRIVER_MEMORY=1G +SPARK_DRIVER_MEMORY=512M SPARK_NUM_EXECUTORS=2 SPARK_EXECUTOR_CORES=1 -SPARK_EXECUTOR_MEMORY=1G +SPARK_EXECUTOR_MEMORY=512M SPARK_TOTAL_CORES=$((SPARK_NUM_EXECUTORS * SPARK_EXECUTOR_CORES)) SPARK_DEFAULT_PARALLELISM=$((SPARK_TOTAL_CORES * 2)) diff --git a/dev/test-cluster/yarn/hadoop-env.sh b/dev/test-cluster/yarn/hadoop-env.sh index f60b65a0b..f6e93a3ce 100755 --- a/dev/test-cluster/yarn/hadoop-env.sh +++ b/dev/test-cluster/yarn/hadoop-env.sh @@ -96,3 +96,5 @@ export HADOOP_SECURE_DN_PID_DIR=${HADOOP_PID_DIR} # A string representing this instance of hadoop. $USER by default. export HADOOP_IDENT_STRING=$USER +source /opt/intel/oneapi/setvars.sh --ccl-configuration=cpu +export FI_TCP_IFACE=eth0 diff --git a/dev/test-cluster/yarn/load-spark-envs.sh b/dev/test-cluster/yarn/load-spark-envs.sh index 7ee0cb452..19c7ad6b7 100755 --- a/dev/test-cluster/yarn/load-spark-envs.sh +++ b/dev/test-cluster/yarn/load-spark-envs.sh @@ -15,5 +15,6 @@ export PYTHONPATH=$SPARK_HOME/python:$PYTHONPATH export PYSPARK_PYTHON=python3 export PATH=$HADOOP_HOME/bin:$SPARK_HOME/bin:$PATH +export FI_TCP_IFACE=eth0 set +x diff --git a/dev/test-cluster/yarn/setup-cluster.sh b/dev/test-cluster/yarn/setup-cluster.sh index d57edf90c..18e9a6e15 100755 --- a/dev/test-cluster/yarn/setup-cluster.sh +++ b/dev/test-cluster/yarn/setup-cluster.sh @@ -40,6 +40,7 @@ cp ./yarn-site.xml ~/opt/hadoop-$HADOOP_VERSION/etc/hadoop/ cp ./hadoop-env.sh ~/opt/hadoop-$HADOOP_VERSION/etc/hadoop/ cp ../log4j.properties ~/opt/spark-$SPARK_VERSION-bin-$SPARK_HADOOP_VERSION/conf cp ./spark-defaults.conf ~/opt/spark-$SPARK_VERSION-bin-$SPARK_HADOOP_VERSION/conf +cp ~/opt/spark-$SPARK_VERSION-bin-$SPARK_HADOOP_VERSION/yarn/spark-*-yarn-shuffle.jar ~/opt/hadoop-$HADOOP_VERSION/share/hadoop/yarn/lib/ echo $HOST_IP > $HADOOP_HOME/etc/hadoop/slaves echo $HOST_IP > $SPARK_HOME/conf/slaves @@ -53,10 +54,20 @@ mkdir -p /tmp/run/hdfs/datanode # hdfs format $HADOOP_HOME/bin/hdfs namenode -format +wget -P $HADOOP_HOME/share/hadoop/yarn/lib/ https://repo1.maven.org/maven2/javax/activation/activation/1.1.1/activation-1.1.1.jar + # start hdfs and yarn $HADOOP_HOME/sbin/start-dfs.sh $HADOOP_HOME/sbin/start-yarn.sh +jps +free -g +df -h +yarn application -list +ls -ls $HADOOP_HOME/logs/ +cat $HADOOP_HOME/logs/hadoop-*-resourcemanager-*.log +cat $HADOOP_HOME/logs/hadoop-*-nodemanager-*.log + hadoop fs -ls / yarn node -list diff --git a/dev/test-cluster/yarn/spark-defaults.conf b/dev/test-cluster/yarn/spark-defaults.conf index 04ed10b97..19cba6e71 100644 --- a/dev/test-cluster/yarn/spark-defaults.conf +++ b/dev/test-cluster/yarn/spark-defaults.conf @@ -25,10 +25,3 @@ # spark.serializer org.apache.spark.serializer.KryoSerializer # spark.driver.memory 5g # spark.executor.extraJavaOptions -XX:+PrintGCDetails -Dkey=value -Dnumbers="one two three" - -spark.master yarn -spark.serializer org.apache.spark.serializer.KryoSerializer -spark.driver.memory 1g -spark.executor.num 2 -spark.executor.cores 1 -spark.executor.memory 2g diff --git a/dev/test-cluster/yarn/yarn-site.xml b/dev/test-cluster/yarn/yarn-site.xml index ff74d23a7..9f6627bef 100644 --- a/dev/test-cluster/yarn/yarn-site.xml +++ b/dev/test-cluster/yarn/yarn-site.xml @@ -33,7 +33,7 @@ yarn.nodemanager.resource.memory-mb - 7168 + 3072 yarn.nodemanager.resource.cpu-vcores @@ -43,17 +43,21 @@ yarn.nodemanager.vmem-check-enabled false + + yarn.nodemanager.pmem-check-enabled + false + yarn.nodemanager.vmem-pmem-ratio - 2 + 1 yarn.scheduler.minimum-allocation-mb - 1024 + 256 yarn.scheduler.maximum-allocation-mb - 7168 + 3072 yarn.scheduler.minimum-allocation-vcores @@ -63,5 +67,8 @@ yarn.scheduler.maximum-allocation-vcores 2 - + + yarn.nodemanager.env-whitelist + JAVA_HOME,HADOOP_COMMON_HOME,HADOOP_HDFS_HOME,HADOOP_CONF_DIR,HADOOP_YARN_HOME,LD_LIBRARY_PATH,FI_TCP_IFACE,CMPLR_ROOT,DAALROOT,CCL_ROOT + diff --git a/examples/build-all-scala.sh b/examples/build-all-scala.sh index 8babb9d97..ad507e031 100755 --- a/examples/build-all-scala.sh +++ b/examples/build-all-scala.sh @@ -1,6 +1,6 @@ #!/usr/bin/env bash -exampleDirs=(kmeans pca als naive-bayes linear-regression correlation summarizer) +exampleDirs=(kmeans) for dir in ${exampleDirs[*]} do diff --git a/examples/run-all-scala.sh b/examples/run-all-scala.sh index 04bab7f8a..241a861ca 100755 --- a/examples/run-all-scala.sh +++ b/examples/run-all-scala.sh @@ -1,6 +1,6 @@ #!/usr/bin/env bash -exampleDirs=(kmeans pca als naive-bayes linear-regression correlation summarizer) +exampleDirs=(kmeans) for dir in ${exampleDirs[*]} do