[ML-12] Improve CI and add pseudo cluster testing (#20)
xwu99 authored Mar 5, 2021
1 parent e1c33d9 commit 6fe5d3e
Showing 12 changed files with 461 additions and 0 deletions.
6 changes: 6 additions & 0 deletions dev/test-cluster/config-ssh.sh
@@ -0,0 +1,6 @@
#!/usr/bin/env bash

ssh-keygen -q -N "" -t rsa -f ~/.ssh/id_rsa
cat ~/.ssh/id_rsa.pub >> ~/.ssh/authorized_keys
echo " StrictHostKeyChecking no " | sudo tee -a /etc/ssh/ssh_config
sudo service ssh restart
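
A quick way to confirm the passwordless setup took effect is a non-interactive login attempt (a minimal sketch; BatchMode makes ssh fail instead of prompting for a password):

ssh -o BatchMode=yes localhost true && echo "passwordless SSH OK"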
24 changes: 24 additions & 0 deletions dev/test-cluster/core-site.xml
@@ -0,0 +1,24 @@
<?xml version="1.0" encoding="UTF-8"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<!--
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. See accompanying LICENSE file.
-->

<!-- Put site-specific property overrides in this file. -->

<configuration>
<property>
<name>fs.default.name</name>
<value>hdfs://localhost:8020</value>
</property>
</configuration>
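
Once the daemons are up, the effective default filesystem can be read back with the stock getconf tool (a sketch; fs.defaultFS is the non-deprecated alias of fs.default.name):

hdfs getconf -confKey fs.defaultFS   # expect: hdfs://localhost:8020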
22 changes: 22 additions & 0 deletions dev/test-cluster/envs.sh
@@ -0,0 +1,22 @@
# Set user Spark and Hadoop home directories
export HADOOP_HOME=~/opt/hadoop-2.7.7
export HADOOP_CONF_DIR=$HADOOP_HOME/etc/hadoop
export SPARK_HOME=~/opt/spark-3.0.0-bin-hadoop2.7

export PYTHONPATH=$SPARK_HOME/python:$PYTHONPATH
export PYSPARK_PYTHON=python3

# Set user HDFS Root
export HDFS_ROOT=hdfs://localhost:8020
export OAP_MLLIB_DATA_ROOT=OAPMLlib/Data
# Set user Intel MLlib Root directory
export OAP_MLLIB_ROOT=${GITHUB_WORKSPACE}

# Target jar built
OAP_MLLIB_JAR_NAME=oap-mllib-1.1.0.jar
OAP_MLLIB_JAR=$OAP_MLLIB_ROOT/mllib-dal/target/$OAP_MLLIB_JAR_NAME

# Use absolute path
SPARK_DRIVER_CLASSPATH=$OAP_MLLIB_JAR
# Use relative path
SPARK_EXECUTOR_CLASSPATH=./$OAP_MLLIB_JAR_NAME
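
Since the test scripts source this file, a cheap guard that the build artifact exists can fail fast in CI (a sketch, assuming the repo layout above):

source dev/test-cluster/envs.sh
[ -f "$OAP_MLLIB_JAR" ] || { echo "missing $OAP_MLLIB_JAR - build mllib-dal first"; exit 1; }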
99 changes: 99 additions & 0 deletions dev/test-cluster/hadoop-env.sh
@@ -0,0 +1,99 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Set Hadoop-specific environment variables here.

# The only required environment variable is JAVA_HOME. All others are
# optional. When running a distributed configuration it is best to
# set JAVA_HOME in this file, so that it is correctly defined on
# remote nodes.

# The java implementation to use.
# export JAVA_HOME=${JAVA_HOME}
export JAVA_HOME=/usr/local/lib/jvm/openjdk8

# The jsvc implementation to use. Jsvc is required to run secure datanodes
# that bind to privileged ports to provide authentication of data transfer
# protocol. Jsvc is not required if SASL is configured for authentication of
# data transfer protocol using non-privileged ports.
#export JSVC_HOME=${JSVC_HOME}

export HADOOP_CONF_DIR=${HADOOP_CONF_DIR:-"/etc/hadoop"}

# Extra Java CLASSPATH elements. Automatically insert capacity-scheduler.
for f in $HADOOP_HOME/contrib/capacity-scheduler/*.jar; do
  if [ "$HADOOP_CLASSPATH" ]; then
    export HADOOP_CLASSPATH=$HADOOP_CLASSPATH:$f
  else
    export HADOOP_CLASSPATH=$f
  fi
done

# The maximum amount of heap to use, in MB. Default is 1000.
#export HADOOP_HEAPSIZE=
#export HADOOP_NAMENODE_INIT_HEAPSIZE=""

# Extra Java runtime options. Empty by default.
export HADOOP_OPTS="$HADOOP_OPTS -Djava.net.preferIPv4Stack=true"

# Command specific options appended to HADOOP_OPTS when specified
export HADOOP_NAMENODE_OPTS="-Dhadoop.security.logger=${HADOOP_SECURITY_LOGGER:-INFO,RFAS} -Dhdfs.audit.logger=${HDFS_AUDIT_LOGGER:-INFO,NullAppender} $HADOOP_NAMENODE_OPTS"
export HADOOP_DATANODE_OPTS="-Dhadoop.security.logger=ERROR,RFAS $HADOOP_DATANODE_OPTS"

export HADOOP_SECONDARYNAMENODE_OPTS="-Dhadoop.security.logger=${HADOOP_SECURITY_LOGGER:-INFO,RFAS} -Dhdfs.audit.logger=${HDFS_AUDIT_LOGGER:-INFO,NullAppender} $HADOOP_SECONDARYNAMENODE_OPTS"

export HADOOP_NFS3_OPTS="$HADOOP_NFS3_OPTS"
export HADOOP_PORTMAP_OPTS="-Xmx512m $HADOOP_PORTMAP_OPTS"

# The following applies to multiple commands (fs, dfs, fsck, distcp etc)
export HADOOP_CLIENT_OPTS="-Xmx512m $HADOOP_CLIENT_OPTS"
#HADOOP_JAVA_PLATFORM_OPTS="-XX:-UsePerfData $HADOOP_JAVA_PLATFORM_OPTS"

# On secure datanodes, user to run the datanode as after dropping privileges.
# This **MUST** be uncommented to enable secure HDFS if using privileged ports
# to provide authentication of data transfer protocol. This **MUST NOT** be
# defined if SASL is configured for authentication of data transfer protocol
# using non-privileged ports.
export HADOOP_SECURE_DN_USER=${HADOOP_SECURE_DN_USER}

# Where log files are stored. $HADOOP_HOME/logs by default.
#export HADOOP_LOG_DIR=${HADOOP_LOG_DIR}/$USER

# Where log files are stored in the secure data environment.
export HADOOP_SECURE_DN_LOG_DIR=${HADOOP_LOG_DIR}/${HADOOP_HDFS_USER}

###
# HDFS Mover specific parameters
###
# Specify the JVM options to be used when starting the HDFS Mover.
# These options will be appended to the options specified as HADOOP_OPTS
# and therefore may override any similar flags set in HADOOP_OPTS
#
# export HADOOP_MOVER_OPTS=""

###
# Advanced Users Only!
###

# The directory where pid files are stored. /tmp by default.
# NOTE: this should be set to a directory that can only be written to by
# the user that will run the hadoop daemons. Otherwise there is the
# potential for a symlink attack.
export HADOOP_PID_DIR=${HADOOP_PID_DIR}
export HADOOP_SECURE_DN_PID_DIR=${HADOOP_PID_DIR}

# A string representing this instance of hadoop. $USER by default.
export HADOOP_IDENT_STRING=$USER
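
The JAVA_HOME above is hardcoded to the CI image's JDK location; a hedged sanity check that the path really holds a JVM (the path is the one exported above) could be:

[ -x /usr/local/lib/jvm/openjdk8/bin/java ] && /usr/local/lib/jvm/openjdk8/bin/java -version || echo "JAVA_HOME path is stale"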
32 changes: 32 additions & 0 deletions dev/test-cluster/hdfs-site.xml
@@ -0,0 +1,32 @@
<?xml version="1.0" encoding="UTF-8"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<!--
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. See accompanying LICENSE file.
-->

<!-- Put site-specific property overrides in this file. -->

<configuration>
  <property>
    <name>dfs.replication</name>
    <value>1</value>
  </property>
  <property>
    <name>dfs.namenode.name.dir</name>
    <value>/tmp/run/hdfs/namenode</value>
  </property>
  <property>
    <name>dfs.datanode.data.dir</name>
    <value>/tmp/run/hdfs/datanode</value>
  </property>
</configuration>
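
Replication of 1 suits the single-node pseudo cluster; after startup the setting can be confirmed with fsck (a sketch using the stock tool):

hdfs fsck / | grep -i "replication factor"   # expect: Default replication factor: 1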
42 changes: 42 additions & 0 deletions dev/test-cluster/setup-cluster.sh
@@ -0,0 +1,42 @@
#!/usr/bin/env bash

WORK_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"

cd $WORK_DIR

echo JAVA_HOME is $JAVA_HOME

mkdir ~/opt
cd ~/opt
wget https://archive.apache.org/dist/spark/spark-3.0.0/spark-3.0.0-bin-hadoop2.7.tgz
tar -xzf spark-3.0.0-bin-hadoop2.7.tgz
wget https://archive.apache.org/dist/hadoop/core/hadoop-2.7.7/hadoop-2.7.7.tar.gz
tar -xzf hadoop-2.7.7.tar.gz

cd $WORK_DIR

cp ./core-site.xml ~/opt/hadoop-2.7.7/etc/hadoop/
cp ./hdfs-site.xml ~/opt/hadoop-2.7.7/etc/hadoop/
cp ./yarn-site.xml ~/opt/hadoop-2.7.7/etc/hadoop/
cp ./hadoop-env.sh ~/opt/hadoop-2.7.7/etc/hadoop/
cp ./spark-defaults.conf ~/opt/spark-3.0.0-bin-hadoop2.7/conf

# create directories
mkdir -p /tmp/run/hdfs/namenode
mkdir -p /tmp/run/hdfs/datanode

# format the HDFS namenode
~/opt/hadoop-2.7.7/bin/hdfs namenode -format

export HADOOP_HOME=~/opt/hadoop-2.7.7
export HADOOP_CONF_DIR=$HADOOP_HOME/etc/hadoop
export SPARK_HOME=~/opt/spark-3.0.0-bin-hadoop2.7

export PATH=$HADOOP_HOME/bin:$SPARK_HOME/bin:$PATH

# start hdfs and yarn
$HADOOP_HOME/sbin/start-dfs.sh
$HADOOP_HOME/sbin/start-yarn.sh

hadoop fs -ls /
yarn node -list
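
If both daemons started cleanly, dfsadmin should report exactly one live datanode; a sketch of a post-setup health check (relies on the PATH export above):

hdfs dfsadmin -report | grep "Live datanodes"   # expect: Live datanodes (1)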
12 changes: 12 additions & 0 deletions dev/test-cluster/setup-python3-env.sh
@@ -0,0 +1,12 @@
#!/usr/bin/env bash

sudo apt-get update
sudo apt-get install python3-pip python3-setuptools python3-wheel

pip3 install --user numpy

echo python is in $(which python)
python --version

echo python3 is in $(which python3)
python3 --version
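
A quick check that NumPy is importable from the interpreter PySpark will use (PYSPARK_PYTHON=python3 per envs.sh) might look like:

python3 -c "import numpy; print('numpy', numpy.__version__)"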
34 changes: 34 additions & 0 deletions dev/test-cluster/spark-defaults.conf
@@ -0,0 +1,34 @@
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

# Default system properties included when running spark-submit.
# This is useful for setting default environmental settings.

# Example:
# spark.master spark://master:7077
# spark.eventLog.enabled true
# spark.eventLog.dir hdfs://namenode:8021/directory
# spark.serializer org.apache.spark.serializer.KryoSerializer
# spark.driver.memory 5g
# spark.executor.extraJavaOptions -XX:+PrintGCDetails -Dkey=value -Dnumbers="one two three"

spark.master                     yarn
spark.serializer                 org.apache.spark.serializer.KryoSerializer
spark.driver.memory              3g
spark.executor.instances         2
spark.executor.cores             1
spark.executor.memory            4g
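
spark-submit --verbose echoes the properties it loaded from this file, which is a cheap way to confirm the defaults are picked up (a sketch; the examples jar path is an assumption about the Spark 3.0.0 binary distribution layout):

$SPARK_HOME/bin/spark-submit --verbose --class org.apache.spark.examples.SparkPi \
    $SPARK_HOME/examples/jars/spark-examples_2.12-3.0.0.jar 10 2>&1 | grep "spark\.executor"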
70 changes: 70 additions & 0 deletions dev/test-cluster/workloads/kmeans-pyspark.py
@@ -0,0 +1,70 @@
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

"""
An example demonstrating k-means clustering.
Run with:
bin/spark-submit examples/src/main/python/ml/kmeans_example.py
This example requires NumPy (http://www.numpy.org/).
"""
from __future__ import print_function
import sys

# $example on$
from pyspark.ml.clustering import KMeans
from pyspark.ml.evaluation import ClusteringEvaluator
# $example off$

from pyspark.sql import SparkSession

if __name__ == "__main__":
spark = SparkSession\
.builder\
.appName("KMeansExample")\
.getOrCreate()

if (len(sys.argv) != 2) :
println("Require data file path as input parameter")
sys.exit(1)

# $example on$
# Loads data.
dataset = spark.read.format("libsvm").load(sys.argv[1])

# Trains a k-means model.
kmeans = KMeans().setK(2).setSeed(1)
model = kmeans.fit(dataset)

# Make predictions
predictions = model.transform(dataset)

# Evaluate clustering by computing Silhouette score
evaluator = ClusteringEvaluator()

silhouette = evaluator.evaluate(predictions)
print("Silhouette with squared euclidean distance = " + str(silhouette))

# Shows the result.
centers = model.clusterCenters()
print("Cluster Centers: ")
for center in centers:
print(center)
# $example off$

spark.stop()
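
For a fast check without the pseudo cluster, the same script also runs in local mode against the bundled sample data (a sketch; the file:// prefix keeps the path on the local filesystem, and paths assume the layout from setup-cluster.sh):

$SPARK_HOME/bin/spark-submit --master "local[2]" \
    dev/test-cluster/workloads/kmeans-pyspark.py \
    file://$SPARK_HOME/data/mllib/sample_kmeans_data.txt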

48 changes: 48 additions & 0 deletions dev/test-cluster/workloads/run-kmeans-pyspark.sh
@@ -0,0 +1,48 @@
#!/usr/bin/env bash

source ../envs.sh

# The data file comes from the Spark examples (data/mllib/sample_kmeans_data.txt) and must be copied to HDFS first
$HADOOP_HOME/bin/hadoop fs -mkdir -p $OAP_MLLIB_DATA_ROOT
$HADOOP_HOME/bin/hadoop fs -copyFromLocal $SPARK_HOME/data/mllib/sample_kmeans_data.txt $OAP_MLLIB_DATA_ROOT

# The user should check that the requested resources are actually allocated by the cluster manager, or Intel MLlib will behave incorrectly
SPARK_MASTER=yarn
SPARK_DRIVER_MEMORY=1G
SPARK_NUM_EXECUTORS=2
SPARK_EXECUTOR_CORES=1
SPARK_EXECUTOR_MEMORY=1G

SPARK_DEFAULT_PARALLELISM=$(expr $SPARK_NUM_EXECUTORS '*' $SPARK_EXECUTOR_CORES '*' 2)

# ======================================================= #

# Check env
if [[ -z $SPARK_HOME ]]; then
    echo SPARK_HOME not defined!
    exit 1
fi

if [[ -z $HADOOP_HOME ]]; then
    echo HADOOP_HOME not defined!
    exit 1
fi

APP_PY="$OAP_MLLIB_ROOT/dev/test-cluster/workloads/kmeans-pyspark.py"
DATA_FILE=$OAP_MLLIB_DATA_ROOT/sample_kmeans_data.txt

$SPARK_HOME/bin/spark-submit --master $SPARK_MASTER -v \
    --num-executors $SPARK_NUM_EXECUTORS \
    --driver-memory $SPARK_DRIVER_MEMORY \
    --executor-cores $SPARK_EXECUTOR_CORES \
    --executor-memory $SPARK_EXECUTOR_MEMORY \
    --conf "spark.serializer=org.apache.spark.serializer.KryoSerializer" \
    --conf "spark.default.parallelism=$SPARK_DEFAULT_PARALLELISM" \
    --conf "spark.sql.shuffle.partitions=$SPARK_DEFAULT_PARALLELISM" \
    --conf "spark.driver.extraClassPath=$SPARK_DRIVER_CLASSPATH" \
    --conf "spark.executor.extraClassPath=$SPARK_EXECUTOR_CLASSPATH" \
    --conf "spark.shuffle.reduceLocality.enabled=false" \
    --conf "spark.network.timeout=1200s" \
    --conf "spark.task.maxFailures=1" \
    --jars $OAP_MLLIB_JAR \
    $APP_PY $DATA_FILE
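
With spark.task.maxFailures=1, any task failure fails the job immediately, so in CI it is also worth confirming the final YARN application state (a sketch using the stock YARN CLI):

yarn application -list -appStates FINISHED,FAILED,KILLED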