diff --git a/.gitignore b/.gitignore new file mode 100644 index 000000000..89fe85c54 --- /dev/null +++ b/.gitignore @@ -0,0 +1,2 @@ +# IDEA config files +.idea/ diff --git a/.travis.yml b/.travis.yml index a61af4db1..dcd42b2d6 100644 --- a/.travis.yml +++ b/.travis.yml @@ -7,24 +7,19 @@ jobs: #oap-cache-oap is a CI building demo of the corresponding module oap-cache/oap. - name: oap-cache-oap before_install: - - sudo apt-get install cmake - sudo apt-get install libpthread-stubs0-dev - sudo apt-get install libnuma-dev + - sudo apt-get install cmake install: - # Download spark 2.4.4 - "[ -f spark ] || mkdir spark && cd spark && wget http://archive.apache.org/dist/spark/spark-2.4.4/spark-2.4.4-bin-hadoop2.7.tgz && cd .." - "tar -xf ./spark/spark-2.4.4-bin-hadoop2.7.tgz" - "export SPARK_HOME=`pwd`/spark-2.4.4-bin-hadoop2.7" before_script: - - cd /tmp - - git clone https://github.com/memkind/memkind.git - - cd memkind && ./build.sh - - make - - sudo make install - - cd ${TRAVIS_BUILD_DIR} + - cd ${TRAVIS_BUILD_DIR}/dev + - ./install_vmemcache.sh + - ./install_arrow.sh + - ./install_memkind.sh script: - - cd ${TRAVIS_BUILD_DIR}/oap-cache/oap/ - - mvn clean -q -Ppersistent-memory test - - - + - cd ${TRAVIS_BUILD_DIR} + - mvn clean -q -Ppersistent-memory -Pvmemcache test \ No newline at end of file diff --git a/README.md b/README.md index 923ce6b6a..dca51940b 100644 --- a/README.md +++ b/README.md @@ -1,8 +1,13 @@ -OAP - Optimized Analytics Packages for Spark is a project to optimize Spark by providing optimized implmentation of packages in various aspects including cache, native SQL engine, shuffle, data source and so on. +OAP - Optimized Analytics Packages for Spark is a project to optimize Spark by providing an optimized implementation of packages in various aspects including cache, shuffle, and so on. Please refer the below documents for more information. * [Cache](./oap-cache/README.md) -* [Native SQL Engine](./oap-native-sql/README.md) * [Shuffle](./oap-shuffle/README.md) -* [Data Sources](./oap-data-source/README.md) + + +## Building OAP + +Please refer to the documents below for more information. + +* [Dev](./dev/README.md) \ No newline at end of file diff --git a/dev/README.md b/dev/README.md new file mode 100644 index 000000000..0a1d724cb --- /dev/null +++ b/dev/README.md @@ -0,0 +1,9 @@ +# OAP Developer Scripts +This directory contains scripts that are useful to developers for packaging and testing. + +## Build OAP + +Build the project using the following command, choosing the build type according to your needs. All generated jars will be placed under dev/target/. +``` + sh make-distribution.sh +``` \ No newline at end of file diff --git a/dev/install_arrow.sh b/dev/install_arrow.sh new file mode 100755 index 000000000..37bd0b2b8 --- /dev/null +++ b/dev/install_arrow.sh @@ -0,0 +1,14 @@ +#install arrow and plasma +cd /tmp +git clone https://github.com/Intel-bigdata/arrow.git +cd arrow && git checkout oap-master +cd cpp +rm -rf release +mkdir release +cd release +#build libarrow, libplasma, libplasma_java +cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_C_FLAGS="-g -O3" -DCMAKE_CXX_FLAGS="-g -O3" -DARROW_BUILD_TESTS=on -DARROW_PLASMA_JAVA_CLIENT=on -DARROW_PLASMA=on -DARROW_DEPENDENCY_SOURCE=BUNDLED ..
+make -j$(nproc) +sudo make install -j$(nproc) +cd ../../java +mvn clean -q -DskipTests install \ No newline at end of file diff --git a/dev/install_memkind.sh b/dev/install_memkind.sh new file mode 100755 index 000000000..808c7b5a0 --- /dev/null +++ b/dev/install_memkind.sh @@ -0,0 +1,6 @@ +#install memkind +cd /tmp +git clone https://github.com/memkind/memkind.git +cd memkind && ./build.sh +make +sudo make install \ No newline at end of file diff --git a/dev/install_vmemcache.sh b/dev/install_vmemcache.sh new file mode 100755 index 000000000..2ba4c6dcb --- /dev/null +++ b/dev/install_vmemcache.sh @@ -0,0 +1,10 @@ +#install vmemcache +cd /tmp +git clone https://github.com/pmem/vmemcache.git +pushd vmemcache +mkdir build +cd build +cmake .. -DCMAKE_INSTALL_PREFIX=/usr -DCPACK_GENERATOR=deb +make package +sudo dpkg -i libvmemcache*.deb +popd \ No newline at end of file diff --git a/dev/make-distribution.sh b/dev/make-distribution.sh new file mode 100644 index 000000000..4ba3993de --- /dev/null +++ b/dev/make-distribution.sh @@ -0,0 +1,174 @@ +#!/bin/bash + +# set -e +MAVEN_TARGET_VERSION=3.6.3 + +CMAKE_TARGET_VERSION=3.7.1 +CMAKE_MIN_VERSION=3.3 +TARGET_CMAKE_SOURCE_URL=https://cmake.org/files/v3.7/cmake-3.7.1.tar.gz + +dev_path=$(pwd) + +function version_lt() { test "$(echo "$@" | tr " " "\n" | sort -rV | head -n 1)" != "$1"; } + +function version_ge() { test "$(echo "$@" | tr " " "\n" | sort -rV | head -n 1)" == "$1"; } + +function prepare_maven() { + echo "Check maven version......" + CURRENT_MAVEN_VERSION_STR="$(mvn --version)" + if [[ "$CURRENT_MAVEN_VERSION_STR" == "Apache Maven"* ]]; then + echo "mvn is installed" + else + echo "mvn is not installed" + wget https://mirrors.cnnic.cn/apache/maven/maven-3/$MAVEN_TARGET_VERSION/binaries/apache-maven-$MAVEN_TARGET_VERSION-bin.tar.gz + mkdir -p /usr/local/maven + tar -xzvf apache-maven-$MAVEN_TARGET_VERSION-bin.tar.gz + mv apache-maven-$MAVEN_TARGET_VERSION/* /usr/local/maven + echo 'export MAVEN_HOME=/usr/local/maven' >>env.sh + echo 'export PATH=$MAVEN_HOME/bin:$PATH' >>env.sh + echo "Please source env.sh or copy its contents to /etc/profile and source /etc/profile!" + export MAVEN_HOME=/usr/local/maven + export PATH=$MAVEN_HOME/bin:$PATH + rm -rf apache-maven* + fi +} + +function prepare_cmake() { + CURRENT_CMAKE_VERSION_STR="$(cmake --version)" + cd $dev_path + + # echo ${CURRENT_CMAKE_VERSION_STR} + if [[ "$CURRENT_CMAKE_VERSION_STR" == "cmake version"* ]]; then + echo "cmake is installed" + array=(${CURRENT_CMAKE_VERSION_STR//,/ }) + CURRENT_CMAKE_VERSION=${array[2]} + if version_lt $CURRENT_CMAKE_VERSION $CMAKE_MIN_VERSION; then + echo "$CURRENT_CMAKE_VERSION is less than $CMAKE_MIN_VERSION, install cmake $CMAKE_TARGET_VERSION" + mkdir -p thirdparty + cd thirdparty + echo "$dev_path/thirdparty/cmake-$CMAKE_TARGET_VERSION.tar.gz" + if [ ! -f "$dev_path/thirdparty/cmake-$CMAKE_TARGET_VERSION.tar.gz" ]; then + wget $TARGET_CMAKE_SOURCE_URL + fi + tar xvf cmake-$CMAKE_TARGET_VERSION.tar.gz + cd cmake-$CMAKE_TARGET_VERSION/ + ./bootstrap + gmake + gmake install + yum remove cmake -y + ln -s /usr/local/bin/cmake /usr/bin/ + cd $dev_path + fi + else + echo "cmake is not installed" + mkdir -p thirdparty + cd thirdparty + echo "$dev_path/thirdparty/cmake-$CMAKE_TARGET_VERSION.tar.gz" + if [ !
-f "cmake-$CMAKE_TARGET_VERSION.tar.gz" ]; then + wget $TARGET_CMAKE_SOURCE_URL + fi + + tar xvf cmake-$CMAKE_TARGET_VERSION.tar.gz + cd cmake-$CMAKE_TARGET_VERSION/ + ./bootstrap + gmake + gmake install + cd $dev_path + fi +} + +function prepare_memkind() { + memkind_repo="https://github.com/memkind/memkind.git" + echo $memkind_repo + + mkdir -p thirdparty + cd thirdparty + if [ ! -d "memkind" ]; then + git clone $memkind_repo + fi + cd memkind/ + + yum -y install autoconf + yum -y install automake + yum -y install gcc-c++ + yum -y install libtool + yum -y install numactl-devel + yum -y install unzip + yum -y install libnuma-devel + + ./autogen.sh + ./configure + make + make install + cd $dev_path + +} + +function prepare_vmemcache() { + if [ -n "$(rpm -qa | grep libvmemcache)" ]; then + echo "libvmemcache is installed" + return + fi + vmemcache_repo="https://github.com/pmem/vmemcache.git" + prepare_cmake + cd $dev_path + mkdir -p thirdparty + cd thirdparty + if [ ! -d "vmemcache" ]; then + git clone $vmemcache_repo + fi + cd vmemcache + mkdir -p build + cd build + yum -y install rpm-build + cmake .. -DCMAKE_INSTALL_PREFIX=/usr -DCPACK_GENERATOR=rpm + make package + sudo rpm -i libvmemcache*.rpm +} + +function gather() { + cd $dev_path + mkdir -p target + cp ../oap-cache/oap/target/*.jar target/ + cp ../oap-shuffle/remote-shuffle/target/*.jar target/ + cp ../oap-common/target/*.jar target/ + echo "Please check the result in $dev_path/target !" +} + + + + +function prepare_intel_arrow() { + yum -y install libpthread-stubs0-dev + yum -y install libnuma-dev + + #install vemecache + prepare_vmemcache + + #install arrow and plasms + cd $dev_path/thirdparty + if [ ! -d "arrow" ]; then + git clone https://github.com/Intel-bigdata/arrow.git -b oap-master + fi + + cd arrow/cpp + rm -rf release + mkdir -p release + cd release + + #build libarrow, libplasma, libplasma_java + cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_C_FLAGS="-g -O3" -DCMAKE_CXX_FLAGS="-g -O3" -DARROW_BUILD_TESTS=on -DARROW_PLASMA_JAVA_CLIENT=on -DARROW_PLASMA=on -DARROW_DEPENDENCY_SOURCE=BUNDLED .. + make -j$(nproc) + make install -j$(nproc) +} + + +prepare_maven +prepare_memkind +prepare_cmake +prepare_vmemcache +prepare_intel_arrow +cd $dev_path +cd .. +mvn clean -q -Ppersistent-memory -Pvmemcache -DskipTests package +gather \ No newline at end of file diff --git a/oap-cache/oap/.gitignore b/oap-cache/oap/.gitignore index 28de7b3a3..c0267ea64 100644 --- a/oap-cache/oap/.gitignore +++ b/oap-cache/oap/.gitignore @@ -24,5 +24,4 @@ dependency-reduced-pom.xml derby.log metastore_db/ null/ -src/main/native/build conf/persistent-memory.xml diff --git a/oap-cache/oap/.travis.yml b/oap-cache/oap/.travis.yml new file mode 100644 index 000000000..a61af4db1 --- /dev/null +++ b/oap-cache/oap/.travis.yml @@ -0,0 +1,30 @@ +sudo: required +dist: trusty +language: java +jobs: + include: + #Other modules can refer to oap-cache-oap to build independent travis-ci job, + #oap-cache-oap is a CI building demo of the corresponding module oap-cache/oap. + - name: oap-cache-oap + before_install: + - sudo apt-get install cmake + - sudo apt-get install libpthread-stubs0-dev + - sudo apt-get install libnuma-dev + install: + - # Download spark 2.4.4 + - "[ -f spark ] || mkdir spark && cd spark && wget http://archive.apache.org/dist/spark/spark-2.4.4/spark-2.4.4-bin-hadoop2.7.tgz && cd .." 
+ - "tar -xf ./spark/spark-2.4.4-bin-hadoop2.7.tgz" + - "export SPARK_HOME=`pwd`/spark-2.4.4-bin-hadoop2.7" + before_script: + - cd /tmp + - git clone https://github.com/memkind/memkind.git + - cd memkind && ./build.sh + - make + - sudo make install + - cd ${TRAVIS_BUILD_DIR} + script: + - cd ${TRAVIS_BUILD_DIR}/oap-cache/oap/ + - mvn clean -q -Ppersistent-memory test + + + diff --git a/oap-cache/oap/checkstyle.xml b/oap-cache/oap/checkstyle.xml index 870232c31..f526fceb3 100644 --- a/oap-cache/oap/checkstyle.xml +++ b/oap-cache/oap/checkstyle.xml @@ -49,7 +49,7 @@ - + diff --git a/oap-cache/oap/docs/OAP-User-Guide.md b/oap-cache/oap/docs/OAP-User-Guide.md index 1014a4c2b..f1a51e0f9 100644 --- a/oap-cache/oap/docs/OAP-User-Guide.md +++ b/oap-cache/oap/docs/OAP-User-Guide.md @@ -255,10 +255,14 @@ You need to change the value for spark.executor.instances, spark.sql.oap.fiberCa Guava cache is based on memkind library, built on top of jemalloc and provides memory characteristics. To use it in your workload, follow [prerequisites](#prerequisites-1) to set up DCPMM hardware and memkind library correctly. Then follow bellow configurations. +Memkind library also support DAX KMEM mode. Refer [Kernel](https://github.com/memkind/memkind#kernel), this chapter will guide how to configure persistent memory as system ram. Or [Memkind support for KMEM DAX option](https://pmem.io/2020/01/20/memkind-dax-kmem.html) for more details. + +Please note that DAX KMEM mode need kernel version 5.x and memkind version 1.10 or above. + For Parquet data format, provides following conf options: ``` spark.sql.oap.parquet.data.cache.enable true -spark.sql.oap.fiberCache.memory.manager pm +spark.sql.oap.fiberCache.memory.manager pm / kmem spark.oap.cache.strategy guava spark.sql.oap.fiberCache.persistent.memory.initial.size *g spark.sql.extensions org.apache.spark.sql.OapExtensions @@ -268,7 +272,7 @@ For Orc data format, provides following conf options: spark.sql.orc.copyBatchToSpark true spark.sql.oap.orc.data.cache.enable true spark.sql.oap.orc.enable true -spark.sql.oap.fiberCache.memory.manager pm +spark.sql.oap.fiberCache.memory.manager pm / kmem spark.oap.cache.strategy guava spark.sql.oap.fiberCache.persistent.memory.initial.size *g spark.sql.extensions org.apache.spark.sql.OapExtensions @@ -283,7 +287,6 @@ To apply Non-evictable cache strategy in your workload, please follow [prerequis For Parquet data format, provides following conf options: ``` spark.sql.oap.parquet.data.cache.enable true -spark.sql.oap.fiberCache.memory.manager hybrid spark.oap.cache.strategy noevict spark.sql.oap.fiberCache.persistent.memory.initial.size 256g ``` @@ -291,7 +294,6 @@ For Orc data format, provides following conf options: ``` spark.sql.orc.copyBatchToSpark true spark.sql.oap.orc.data.cache.enable true -spark.sql.oap.fiberCache.memory.manager hybrid spark.oap.cache.strategy noevict spark.sql.oap.fiberCache.persistent.memory.initial.size 256g ``` @@ -305,7 +307,6 @@ For Parquet data format, provides following conf options: ``` spark.sql.oap.parquet.data.cache.enable true -spark.sql.oap.fiberCache.memory.manager tmp spark.oap.cache.strategy vmem spark.sql.oap.fiberCache.persistent.memory.initial.size 256g spark.sql.oap.cache.guardian.memory.size 10g # according to your cluster @@ -316,10 +317,10 @@ For Orc data format, provides following conf options: ``` spark.sql.orc.copyBatchToSpark true spark.sql.oap.orc.data.cache.enable true -spark.sql.oap.fiberCache.memory.manager tmp spark.oap.cache.strategy vmem 
spark.sql.oap.fiberCache.persistent.memory.initial.size 256g spark.sql.oap.cache.guardian.memory.size 10g # according to your cluster + ``` Note: If "PendingFiber Size" (on spark web-UI OAP page) is large, or some tasks failed due to "cache guardian use too much memory", user could set `spark.sql.oap.cache.guardian.memory.size ` to a larger number, and the default size is 10GB. Besides, user could increase `spark.sql.oap.cache.guardian.free.thread.nums` or decrease `spark.sql.oap.cache.dispose.timeout.ms` to accelerate memory free. @@ -355,7 +356,6 @@ spark.sql.oap.index.data.cache.separation.enable true spark.oap.cache.strategy mix spark.sql.oap.fiberCache.memory.manager mix spark.sql.oap.mix.index.memory.manager offheap -spark.sql.oap.mix.data.memory.manager tmp spark.sql.oap.mix.index.cache.backend guava spark.sql.oap.mix.data.cache.backend vmem ``` @@ -367,6 +367,18 @@ spark.sql.oap.parquet.data.cache.enable false # for Column spark.sql.oap.orc.binary.cache.enable true # for orc fileformat spark.sql.oap.orc.data.cache.enable false # for ColumnVector, default is false ``` +#### Use External cache strategy + +OAP now supports arrow-plasma as an external cache and will support more types in the future. [Plasma](http://arrow.apache.org/blog/2017/08/08/plasma-in-memory-object-store/) is a high-performance shared-memory object store. + +Provide the following conf options: + +``` +--conf spark.oap.cache.strategy=external +--conf spark.sql.oap.cache.external.client.pool.size=30 +``` +The [Apache Arrow](https://github.com/apache/arrow) source code is modified to support DCPMM. Here is the modified [repo](https://github.com/Intel-bigdata/arrow). + #### Verify DCPMM cache functionality diff --git a/oap-cache/oap/pom.xml b/oap-cache/oap/pom.xml index 7c0f4a23e..82fecd38d 100644 --- a/oap-cache/oap/pom.xml +++ b/oap-cache/oap/pom.xml @@ -15,20 +15,19 @@ xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> 4.0.0 - com.intel.ssg.bdt - oap - 0.7.0 + + com.intel + oap + 0.8.0 + ../../pom.xml + + + oap-cache-oap jar - OAP - https://github.com/Intel-bigdata/OAP + OAP-CACHE-OAP + - - - The Apache Software License, Version 2.0 - http://www.apache.org/licenses/LICENSE-2.0.txt - - @@ -114,6 +113,11 @@ + + com.intel + oap-common + ${project.version} + org.eclipse.jetty jetty-plus @@ -261,6 +265,11 @@ test + + org.apache.arrow + arrow-plasma + 1.0.0-SNAPSHOT + org.antlr antlr4-runtime @@ -407,12 +416,13 @@ with-spark-${spark.internal.version} + com.intel:oap-common org.spark-project.spark:unused com.google.guava:guava org.apache.parquet:* org.apache.orc:* io.airlift:* - org.reflections:reflections + org.reflections:reflections @@ -458,7 +468,7 @@ src/test/scala - scalastyle-config.xml + ${basedir}/scalastyle-config.xml target/scalastyle-output.xml UTF-8 UTF-8 @@ -636,7 +646,7 @@ src/test/java src/test/spark2.4.4/java - checkstyle.xml + ${basedir}/checkstyle.xml ${basedir}/target/checkstyle-output.xml ${project.build.sourceEncoding} ${project.reporting.outputEncoding} @@ -723,89 +733,4 @@ - - - vmemcache - - - - org.apache.maven.plugins - maven-surefire-plugin - - false - - - - org.apache.maven.plugins - maven-antrun-plugin - ${maven-antrun-plugin.version} - - - build-vmemcahe-native - generate-resources - - run - - - - - - - - - - - - - clean-native - clean - - run - - - - - - - - - - - - - - - - persistent-memory - - - - src/main/resources - - - ${basedir}/conf - - - - - exec-maven-plugin - org.codehaus.mojo - ${exec.maven.version} - - - Compile native code and
produce share library - generate-resources - - exec - - - ${basedir}/src/main/native/compile.sh - - - - - - - - diff --git a/oap-cache/oap/src/main/java/org/apache/spark/unsafe/NativeLoader.java b/oap-cache/oap/src/main/java/org/apache/spark/unsafe/NativeLoader.java deleted file mode 100644 index 6977e9471..000000000 --- a/oap-cache/oap/src/main/java/org/apache/spark/unsafe/NativeLoader.java +++ /dev/null @@ -1,124 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -package org.apache.spark.unsafe; - -import java.io.*; - -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - - -public class NativeLoader { - private static final Logger logger = LoggerFactory.getLogger(NativeLoader.class); - - public static void loadLibrary(String libName) { - assertOsArchSupport(); - try { - logger.info("Trying to load library " + libName + " from system library path."); - logger.info("system library path:" + System.getProperty("java.library.path") - + System.getProperty("user.dir")); - System.loadLibrary(libName); - logger.info("load libvmemcachejni succeed."); - return; - } catch (UnsatisfiedLinkError e) { - logger.info("load from system library path failed and will try to load from package."); - } - logger.info("Trying to load library " + libName + " from package."); - loadFromPackage(libName); - } - - private static void loadFromPackage(String libName) { - String fullName = appendPrefixAndSuffix(libName); - String path = "/lib/linux64/" + fullName; - logger.info("library path is " + path); - InputStream input = NativeLoader.class.getResourceAsStream(path); - if (input == null) { - throw new RuntimeException("The library " + path + " doesn't exist"); - } - - File tmpFile = null; - OutputStream output = null; - try { - tmpFile = File.createTempFile("lib", libName + ".so.tmp"); - } catch (IOException e) { - throw new RuntimeException(e); - } - - try { - output = new FileOutputStream(tmpFile); - byte[] buffer = new byte[1024]; - int len = -1; - while ((len = input.read(buffer)) != -1) { - output.write(buffer, 0, len); - } - - try { - output.flush(); - output.close(); - } catch (Exception e) { - // ignore it - } - - System.load(tmpFile.getCanonicalPath()); - } catch (FileNotFoundException e) { - throw new RuntimeException(e); - } catch (IOException e) { - throw new RuntimeException(e); - } catch (UnsatisfiedLinkError e) { - throw new RuntimeException(e); - } finally { - if (input != null) { - try { - input.close(); - input = null; - } catch (Exception e) { - // ignore it - } - } - - if (output != null) { - try { - output.close(); - output = null; - } catch (Exception e) { - // ignore it - } - } - - if (tmpFile != null && tmpFile.exists()) { - tmpFile.delete(); - tmpFile = null; - } - } - } - - private static void assertOsArchSupport() { - 
String osProp = System.getProperty("os.name"); - String archProp = System.getProperty("os.arch"); - if (!osProp.contains("Linux") && !archProp.contains("64")) { - throw new UnsupportedOperationException("We only tested on linux64. It doesn't support on " - + osProp + archProp + "currently"); - } - } - - private static String appendPrefixAndSuffix(String libName) { - // Currently, we only support linux64 - return "lib" + libName + ".so"; - } -} diff --git a/oap-cache/oap/src/main/native/org_apache_spark_unsafe_PersistentMemoryPlatform.h b/oap-cache/oap/src/main/native/org_apache_spark_unsafe_PersistentMemoryPlatform.h deleted file mode 100644 index c83d5baee..000000000 --- a/oap-cache/oap/src/main/native/org_apache_spark_unsafe_PersistentMemoryPlatform.h +++ /dev/null @@ -1,62 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* DO NOT EDIT THIS FILE - it is machine generated */ -#include -/* Header for class org_apache_spark_unsafe_PersistentMemoryPlatform */ - -#ifndef _Included_org_apache_spark_unsafe_PersistentMemoryPlatform -#define _Included_org_apache_spark_unsafe_PersistentMemoryPlatform -#ifdef __cplusplus -extern "C" { -#endif -/* - * Class: org_apache_spark_unsafe_PersistentMemoryPlatform - * Method: initializeNative - * Signature: (Ljava/lang/String;J)V - */ -JNIEXPORT void JNICALL Java_org_apache_spark_unsafe_PersistentMemoryPlatform_initializeNative - (JNIEnv *, jclass, jstring, jlong, jint); - -/* - * Class: org_apache_spark_unsafe_PersistentMemoryPlatform - * Method: allocateMemory - * Signature: (J)J - */ -JNIEXPORT jlong JNICALL Java_org_apache_spark_unsafe_PersistentMemoryPlatform_allocateVolatileMemory - (JNIEnv *, jclass, jlong); - -/* - * Class: org_apache_spark_unsafe_PersistentMemoryPlatform - * Method: getOccupiedSize - * Signature: (J)J - */ -JNIEXPORT jlong JNICALL Java_org_apache_spark_unsafe_PersistentMemoryPlatform_getOccupiedSize - (JNIEnv *, jclass, jlong); - -/* - * Class: org_apache_spark_unsafe_PersistentMemoryPlatform - * Method: freeMemory - * Signature: (J)V - */ -JNIEXPORT void JNICALL Java_org_apache_spark_unsafe_PersistentMemoryPlatform_freeMemory - (JNIEnv *, jclass, jlong); - -#ifdef __cplusplus -} -#endif -#endif diff --git a/oap-cache/oap/src/main/scala/org/apache/spark/sql/execution/datasources/oap/filecache/FiberCacheManager.scala b/oap-cache/oap/src/main/scala/org/apache/spark/sql/execution/datasources/oap/filecache/FiberCacheManager.scala index 3a6021d0c..7f5eb6e14 100644 --- a/oap-cache/oap/src/main/scala/org/apache/spark/sql/execution/datasources/oap/filecache/FiberCacheManager.scala +++ b/oap-cache/oap/src/main/scala/org/apache/spark/sql/execution/datasources/oap/filecache/FiberCacheManager.scala @@ -17,7 +17,7 @@ package org.apache.spark.sql.execution.datasources.oap.filecache 
-import java.util.concurrent.{ConcurrentHashMap, TimeUnit} +import java.util.concurrent.{ConcurrentHashMap, Executors, LinkedBlockingQueue, TimeUnit} import java.util.concurrent.atomic.AtomicLong import java.util.concurrent.locks.ReentrantReadWriteLock @@ -42,6 +42,7 @@ private[sql] class FiberCacheManager( private val NO_EVICT_CACHE = "noevict" private val VMEM_CACHE = "vmem" private val MIX_CACHE = "mix" + private val EXTERNAL_CACHE = "external" private val DEFAULT_CACHE_STRATEGY = GUAVA_CACHE private var _dataCacheCompressEnable = sparkEnv.conf.get( @@ -79,9 +80,11 @@ private[sql] class FiberCacheManager( } else if (cacheName.equals(SIMPLE_CACHE)) { new SimpleOapCache() } else if (cacheName.equals(NO_EVICT_CACHE)) { - new NonEvictPMCache(dataCacheMemorySize, dataCacheGuardianMemorySize, FiberType.DATA) + new NoEvictPMCache(dataCacheMemorySize, dataCacheGuardianMemorySize, FiberType.DATA) } else if (cacheName.equals(VMEM_CACHE)) { new VMemCache(FiberType.DATA) + } else if (cacheName.equals(EXTERNAL_CACHE)) { + new ExternalCache(FiberType.DATA) } else if (cacheName.equals(MIX_CACHE)) { val separateCache = sparkEnv.conf.getBoolean( OapConf.OAP_INDEX_DATA_SEPARATION_ENABLE.key, diff --git a/oap-cache/oap/src/main/scala/org/apache/spark/sql/execution/datasources/oap/filecache/MemoryManager.scala b/oap-cache/oap/src/main/scala/org/apache/spark/sql/execution/datasources/oap/filecache/MemoryManager.scala index 1b40c825c..d4ef373ed 100644 --- a/oap-cache/oap/src/main/scala/org/apache/spark/sql/execution/datasources/oap/filecache/MemoryManager.scala +++ b/oap-cache/oap/src/main/scala/org/apache/spark/sql/execution/datasources/oap/filecache/MemoryManager.scala @@ -22,6 +22,8 @@ import java.util.concurrent.atomic.AtomicLong import scala.util.Success +import com.intel.oap.common.unsafe.PersistentMemoryPlatform + import org.apache.spark.SparkEnv import org.apache.spark.internal.Logging import org.apache.spark.internal.config.ConfigEntry @@ -31,7 +33,7 @@ import org.apache.spark.sql.execution.datasources.oap.filecache.FiberType.FiberT import org.apache.spark.sql.execution.datasources.oap.utils.PersistentMemoryConfigUtils import org.apache.spark.sql.internal.oap.OapConf import org.apache.spark.storage.{BlockManager, TestBlockId} -import org.apache.spark.unsafe.{PersistentMemoryPlatform, Platform, VMEMCacheJNI} +import org.apache.spark.unsafe.Platform import org.apache.spark.util.Utils object SourceEnum extends Enumeration { @@ -91,6 +93,31 @@ private[sql] object MemoryManager extends Logging { */ private[filecache] val DUMMY_BLOCK_ID = TestBlockId("oap_memory_request_block") + private def checkConfCompatibility(cacheStrategy: String, memoryManagerOpt: String): Unit = { + cacheStrategy match { + case "guava" => + if (!(memoryManagerOpt.equals("pm")||memoryManagerOpt.equals("offheap"))) { + throw new UnsupportedOperationException(s"For cache strategy" + + s" ${cacheStrategy}, memorymanager should be 'offheap' or 'pm'" + + s" but not ${memoryManagerOpt}.") + } + case "vmem" => + if (!memoryManagerOpt.equals("tmp")) { + logWarning(s"current spark.sql.oap.fiberCache.memory.manager: ${memoryManagerOpt} " + + "takes no effect, use 'tmp' as memory manager for vmem cache instead.") + } + case "noevict" => + if (!memoryManagerOpt.equals("hybrid")) { + logWarning(s"current spark.sql.oap.fiberCache.memory.manager: ${memoryManagerOpt} " + + "takes no effect, use 'hybrid' as memory manager for noevict cache instead.") + } + case _ => + logInfo("current cache type may need further compatibility" + + " check against 
backend cache strategy and memory manager. " + + "Please refer enabling-indexdata-cache-separation part in OAP-User-Guide.md.") + } + } + def apply(sparkEnv: SparkEnv): MemoryManager = { apply(sparkEnv, OapConf.OAP_FIBERCACHE_STRATEGY, FiberType.DATA) } @@ -102,6 +129,7 @@ private[sql] object MemoryManager extends Logging { case "pm" => new PersistentMemoryManager(sparkEnv) case "hybrid" => new HybridMemoryManager(sparkEnv) case "tmp" => new TmpDramMemoryManager(sparkEnv) + case "kmem" => new DaxKmemMemoryManager(sparkEnv) case _ => throw new UnsupportedOperationException( s"The memory manager: ${memoryManagerOpt} is not supported now") } @@ -116,38 +144,34 @@ private[sql] object MemoryManager extends Logging { configEntry.defaultValue.get).toLowerCase val memoryManagerOpt = conf.get(OapConf.OAP_FIBERCACHE_MEMORY_MANAGER.key, "offheap").toLowerCase + checkConfCompatibility(cacheStrategyOpt, memoryManagerOpt) cacheStrategyOpt match { - case "guava" => - memoryManagerOpt match { - case "offheap" | "pm" => apply(sparkEnv, memoryManagerOpt) - case _ => throw new UnsupportedOperationException(s"For cache strategy" + - s" ${cacheStrategyOpt}, memorymanager should be 'offheap' or 'pm'" + - s" but not ${memoryManagerOpt}.") - } - case "nonevict" => - if (!memoryManagerOpt.equals("hybrid")) { - logWarning(s"current spark.sql.oap.fiberCache.memory.manager: ${memoryManagerOpt} " + - "takes no effect, use 'hybrid' as memory manager for nonevict cache instead.") - } - new HybridMemoryManager(sparkEnv) - case "vmem" => - if (!memoryManagerOpt.equals("tmp")) { - logWarning(s"current spark.sql.oap.fiberCache.memory.manager: ${memoryManagerOpt} " + - "takes no effect, use 'tmp' as memory manager for vmem cache instead.") - } - new TmpDramMemoryManager(sparkEnv) + case "guava" => apply(sparkEnv, memoryManagerOpt) + case "noevict" => new HybridMemoryManager(sparkEnv) + case "vmem" => new TmpDramMemoryManager(sparkEnv) case "mix" => - fiberType match { - case FiberType.DATA => - val dataMemoryManagerOpt = - conf.get(OapConf.OAP_MIX_DATA_MEMORY_MANAGER.key, "pm").toLowerCase - apply(sparkEnv, dataMemoryManagerOpt) - case FiberType.INDEX => - val indexMemoryManagerOpt = - conf.get(OapConf.OAP_MIX_INDEX_MEMORY_MANAGER.key, "offheap").toLowerCase - apply(sparkEnv, indexMemoryManagerOpt) - case _ => - null + if (!memoryManagerOpt.equals("mix")) { + apply(sparkEnv, memoryManagerOpt) + } else { + var cacheBackendOpt = "" + var mixMemoryMangerOpt = "" + fiberType match { + case FiberType.DATA => + cacheBackendOpt = + conf.get(OapConf.OAP_MIX_DATA_CACHE_BACKEND.key, "guava").toLowerCase + mixMemoryMangerOpt = + conf.get(OapConf.OAP_MIX_DATA_MEMORY_MANAGER.key, "pm").toLowerCase + case FiberType.INDEX => + cacheBackendOpt = + conf.get(OapConf.OAP_MIX_INDEX_CACHE_BACKEND.key, "guava").toLowerCase + mixMemoryMangerOpt = + conf.get(OapConf.OAP_MIX_INDEX_MEMORY_MANAGER.key, "offheap").toLowerCase + } + checkConfCompatibility(cacheBackendOpt, mixMemoryMangerOpt) + cacheBackendOpt match { + case "vmem" => new TmpDramMemoryManager(sparkEnv) + case _ => apply(sparkEnv, mixMemoryMangerOpt) + } } case _ => throw new UnsupportedOperationException( s"The cache strategy: ${cacheStrategyOpt} is not supported now") @@ -342,6 +366,45 @@ private[filecache] class PersistentMemoryManager(sparkEnv: SparkEnv) override def isDcpmmUsed(): Boolean = {true} } +private[filecache] class DaxKmemMemoryManager(sparkEnv: SparkEnv) + extends PersistentMemoryManager(sparkEnv) with Logging { + + private val _memorySize = init() + + private def init(): 
Long = { + val conf = sparkEnv.conf + + val numaId = conf.getInt("spark.executor.numa.id", -1) + if (numaId == -1) { + throw new OapException("DAX KMEM mode is strongly related to numa node. " + + "Please enable numa binding") + } + + val map = PersistentMemoryConfigUtils.parseConfig(conf) + val regularNodeNum = map.size + val daxNodeId = numaId + regularNodeNum + + val (kmemCacheMemory, kmemCacheMemoryReserverd) = { + (Utils.byteStringAsBytes( + conf.get(OapConf.OAP_FIBERCACHE_PERSISTENT_MEMORY_INITIAL_SIZE).trim), + Utils.byteStringAsBytes( + conf.get(OapConf.OAP_FIBERCACHE_PERSISTENT_MEMORY_RESERVED_SIZE).trim)) + } + require(kmemCacheMemoryReserverd >= 0 && kmemCacheMemoryReserverd < kmemCacheMemory, + s"Reserved size(${kmemCacheMemoryReserverd}) should greater than zero and less than " + + s"initial size(${kmemCacheMemory})" + ) + PersistentMemoryPlatform.setNUMANode(String.valueOf(daxNodeId), String.valueOf(numaId)) + PersistentMemoryPlatform.initialize() + logInfo(s"Running DAX KMEM mode, will use ${kmemCacheMemory} as cache memory, " + + s"reserve $kmemCacheMemoryReserverd") + kmemCacheMemory - kmemCacheMemoryReserverd + } + + override def memorySize: Long = _memorySize + +} + private[filecache] class HybridMemoryManager(sparkEnv: SparkEnv) extends MemoryManager with Logging { private val (persistentMemoryManager, dramMemoryManager) = diff --git a/oap-cache/oap/src/main/scala/org/apache/spark/sql/execution/datasources/oap/filecache/OapCache.scala b/oap-cache/oap/src/main/scala/org/apache/spark/sql/execution/datasources/oap/filecache/OapCache.scala index 7bbe5772e..15614ed7d 100644 --- a/oap-cache/oap/src/main/scala/org/apache/spark/sql/execution/datasources/oap/filecache/OapCache.scala +++ b/oap-cache/oap/src/main/scala/org/apache/spark/sql/execution/datasources/oap/filecache/OapCache.scala @@ -18,6 +18,7 @@ package org.apache.spark.sql.execution.datasources.oap.filecache import java.io.File +import java.nio.{ByteBuffer, DirectByteBuffer} import java.util.concurrent.{ConcurrentHashMap, Executors, LinkedBlockingQueue} import java.util.concurrent.atomic.{AtomicInteger, AtomicLong} import java.util.concurrent.locks.{Condition, ReentrantLock} @@ -25,6 +26,11 @@ import java.util.concurrent.locks.{Condition, ReentrantLock} import scala.collection.JavaConverters._ import com.google.common.cache._ +import com.google.common.hash._ +import com.intel.oap.common.unsafe.VMEMCacheJNI +import org.apache.arrow.plasma +import org.apache.arrow.plasma.exceptions.{DuplicateObjectException, PlasmaClientException} +import sun.nio.ch.DirectBuffer import org.apache.spark.{SparkEnv, SparkException} import org.apache.spark.internal.Logging @@ -34,7 +40,7 @@ import org.apache.spark.sql.execution.datasources.oap.filecache.FiberType.FiberT import org.apache.spark.sql.execution.datasources.oap.utils.PersistentMemoryConfigUtils import org.apache.spark.sql.internal.oap.OapConf import org.apache.spark.sql.oap.OapRuntime -import org.apache.spark.unsafe.VMEMCacheJNI +import org.apache.spark.unsafe.Platform import org.apache.spark.util.Utils private[filecache] class MultiThreadCacheGuardian(maxMemory: Long) extends CacheGuardian(maxMemory) @@ -221,7 +227,8 @@ private[filecache] object OapCache { case "guava" => new GuavaOapCache(cacheMemory, cacheGuardianMemory, fiberType) case "vmem" => new VMemCache(fiberType) case "simple" => new SimpleOapCache() - case "noevict" => new NonEvictPMCache(cacheMemory, cacheGuardianMemory, fiberType) + case "noevict" => new NoEvictPMCache(cacheMemory, cacheGuardianMemory, fiberType) + 
case "external" => new ExternalCache(fiberType) case _ => throw new UnsupportedOperationException( s"The cache backend: ${oapCacheOpt} is not supported now") } @@ -289,7 +296,7 @@ trait OapCache { } -class NonEvictPMCache(pmSize: Long, +class NoEvictPMCache(pmSize: Long, cacheGuardianMemory: Long, fiberType: FiberType) extends OapCache with Logging { // We don't bother the memory use of Simple Cache @@ -739,22 +746,8 @@ class MixCache(dataCacheMemory: Long, private val (dataCacheBackend, indexCacheBackend) = init() - private def isCompatibleWithMemoryManager() = { - val dataMemoryManager = sparkEnv.conf.get(OapConf.OAP_MIX_DATA_MEMORY_MANAGER) - val indexMemoryManager = sparkEnv.conf.get(OapConf.OAP_MIX_INDEX_MEMORY_MANAGER) - val dataCacheBackend = sparkEnv.conf.get(OapConf.OAP_MIX_DATA_CACHE_BACKEND) - val indexCacheBackend = OapConf.OAP_MIX_INDEX_CACHE_BACKEND - - // TmpDramMemoryManager is designed only for VMemcache - if ((dataCacheBackend.equals("vmem") && !dataMemoryManager.equals("tmp")) || - (indexCacheBackend.equals("vmem") && !indexMemoryManager.equals("tmp"))) { - throw new OapException("Please configure TmpDramMemoryManager(tmp) for VMemcache(vmem)") - } - } - private def init(): (OapCache, OapCache) = { if (!separation) { - isCompatibleWithMemoryManager() val dataCacheBackend = OapCache(sparkEnv, OapConf.OAP_MIX_DATA_CACHE_BACKEND, dataCacheMemory, dataCacheGuardianMemory, FiberType.DATA); val indexCacheBackend = OapCache(sparkEnv, OapConf.OAP_MIX_INDEX_CACHE_BACKEND, @@ -852,3 +845,199 @@ class MixCache(dataCacheMemory: Long, indexCacheBackend.cleanUp() } } + +class ExternalCache(fiberType: FiberType) extends OapCache with Logging { + private val conf = SparkEnv.get.conf + private val externalStoreCacheSocket: String = "/tmp/plasmaStore" + private var cacheInit: Boolean = false + def init(): Unit = { + if (!cacheInit) { + try { + System.loadLibrary("plasma_java") + cacheInit = true + } catch { + case e: Exception => logError(s"load plasma jni lib failed " + e.getMessage) + } + } + } + + init() + + private val cacheHitCount: AtomicLong = new AtomicLong(0) + private val cacheMissCount: AtomicLong = new AtomicLong(0) + private val cacheTotalGetTime: AtomicLong = new AtomicLong(0) + private var cacheTotalCount: AtomicLong = new AtomicLong(0) + private var cacheEvictCount: AtomicLong = new AtomicLong(0) + private var cacheTotalSize: AtomicLong = new AtomicLong(0) + + private def emptyDataFiber(fiberLength: Long): FiberCache = + OapRuntime.getOrCreate.fiberCacheManager.getEmptyDataFiberCache(fiberLength) + + var fiberSet = scala.collection.mutable.Set[FiberId]() + val clientPoolSize = conf.get(OapConf.OAP_EXTERNAL_CACHE_CLIENT_POOL_SIZE) + val clientRoundRobin = new AtomicInteger(0) + val plasmaClientPool = new Array[ plasma.PlasmaClient](clientPoolSize) + for ( i <- 0 until clientPoolSize) { + plasmaClientPool(i) = new plasma.PlasmaClient(externalStoreCacheSocket, "", 0) + } + + val cacheGuardian = new MultiThreadCacheGuardian(Int.MaxValue) + cacheGuardian.start() + + val hf: HashFunction = Hashing.murmur3_128() + + def hash(key: Array[Byte]): Array[Byte] = { + val ret = new Array[Byte](20) + hf.newHasher().putBytes(key).hash().writeBytesTo(ret, 0, 20) + ret + } + + def hash(key: String): Array[Byte] = { + hash(key.getBytes()) + } + + def delete(fiberId: FiberId): Unit = { + val objectId = hash(fiberId.toString) + plasmaClientPool(clientRoundRobin.getAndAdd(1) % clientPoolSize).delete(objectId) + } + + def contains(fiberId: FiberId): Boolean = { + val objectId = 
hash(fiberId.toString) + if (plasmaClientPool(clientRoundRobin.getAndAdd(1) % clientPoolSize).contains(objectId)) true + else false + } + + override def get(fiberId: FiberId): FiberCache = { + logDebug(s"external cache get FiberId is ${fiberId}") + val objectId = hash(fiberId.toString) + if(contains(fiberId)) { + var fiberCache : FiberCache = null + try{ + logDebug(s"Cache hit, get from external cache.") + val plasmaClient = plasmaClientPool(clientRoundRobin.getAndAdd(1) % clientPoolSize) + val buf: ByteBuffer = plasmaClient.getObjAsByteBuffer(objectId, -1, false) + cacheHitCount.addAndGet(1) + fiberCache = emptyDataFiber(buf.capacity()) + fiberCache.fiberId = fiberId + Platform.copyMemory(null, buf.asInstanceOf[DirectBuffer].address(), + null, fiberCache.fiberData.baseOffset, buf.capacity()) + plasmaClient.release(objectId) + } + catch { + case getException : plasma.exceptions.PlasmaGetException => + logWarning("Get exception: " + getException.getMessage) + fiberCache = cache(fiberId) + cacheMissCount.addAndGet(1) + } + fiberCache.occupy() + cacheGuardian.addRemovalFiber(fiberId, fiberCache) + fiberCache + } else { + val fiberCache = cache(fiberId) + cacheMissCount.addAndGet(1) + fiberSet.add(fiberId) + fiberCache.occupy() + cacheGuardian.addRemovalFiber(fiberId, fiberCache) + fiberCache + } + } + + override def cache(fiberId: FiberId): FiberCache = { + val fiber = super.cache(fiberId) + fiber.fiberId = fiberId + + val objectId = hash(fiberId.toString) + if( !contains(fiberId)) { + val plasmaClient = plasmaClientPool(clientRoundRobin.getAndAdd(1) % clientPoolSize) + try { + val buf = plasmaClient.create(objectId, fiber.size().toInt) + Platform.copyMemory(null, fiber.fiberData.baseOffset, + null, buf.asInstanceOf[DirectBuffer].address(), fiber.size()) + plasmaClient.seal(objectId) + plasmaClient.release(objectId) + } catch { + case e: DuplicateObjectException => logWarning(e.getMessage) + } + } + fiber + } + + private val _cacheSize: AtomicLong = new AtomicLong(0) + + override def getIfPresent(fiber: FiberId): FiberCache = null + + override def getFibers: Set[FiberId] = { + val list : Array[Array[Byte]] = + plasmaClientPool(clientRoundRobin.getAndAdd(1) % clientPoolSize).list(); + cacheTotalCount = new AtomicLong(list.length) + logDebug("cache total size is " + cacheTotalCount) + list.toSet + fiberSet.foreach( fiber => + if ( !list.contains(hash(fiber.toFiberKey()))) fiberSet.remove(fiber) ) + fiberSet.toSet + } + + override def invalidate(fiber: FiberId): Unit = { } + + override def invalidateAll(fibers: Iterable[FiberId]): Unit = { } + + override def cacheSize: Long = _cacheSize.get() + + override def cacheCount: Long = 0 + + override def cacheStats: CacheStats = { + val array = new Array[Long](4) +// plasmaClientPool(clientRoundRobin.getAndAdd(1) % clientPoolSize).metrics(array) + cacheTotalSize = new AtomicLong(array(3) + array(1)) + // Memory store and external store used size + + if (fiberType == FiberType.INDEX) { + CacheStats( + 0, 0, + cacheTotalCount.get(), + cacheTotalSize.get(), + cacheGuardian.pendingFiberCount, // pendingFiberCount + cacheGuardian.pendingFiberSize, // pendingFiberSize + 0, 0, 0, 0, 0, // For index cache, the data fiber metrics should always be zero + cacheHitCount.get(), // indexFiberHitCount + cacheMissCount.get(), // indexFiberMissCount + cacheHitCount.get(), // indexFiberLoadCount + cacheTotalGetTime.get(), // indexTotalLoadTime + cacheEvictCount.get() // indexEvictionCount + ) + } else { + CacheStats( + cacheTotalCount.get(), + cacheTotalSize.get(), + 0, 
0, + cacheGuardian.pendingFiberCount, // pendingFiberCount + cacheGuardian.pendingFiberSize, // pendingFiberSize + cacheHitCount.get(), // dataFiberHitCount + cacheMissCount.get(), // dataFiberMissCount + cacheHitCount.get(), // dataFiberLoadCount + cacheTotalGetTime.get(), // dataTotalLoadTime + cacheEvictCount.get(), // dataEvictionCount + 0, 0, 0, 0, 0) // For data cache, the index fiber metrics should always be zero + } + } + + override def pendingFiberCount: Int = { + cacheGuardian.pendingFiberCount + } + + override def dataCacheCount: Long = 0 + + override def pendingFiberSize: Long = cacheGuardian.pendingFiberSize + + override def pendingFiberOccupiedSize: Long = cacheGuardian.pendingFiberOccupiedSize + + override def getCacheGuardian: CacheGuardian = cacheGuardian + + override def cleanUp(): Unit = { + invalidateAll(getFibers) + dataFiberSize.set(0L) + dataFiberCount.set(0L) + indexFiberSize.set(0L) + indexFiberCount.set(0L) + } +} diff --git a/oap-cache/oap/src/main/scala/org/apache/spark/sql/internal/oap/OapConf.scala b/oap-cache/oap/src/main/scala/org/apache/spark/sql/internal/oap/OapConf.scala index d1eed8269..1131ee6cd 100644 --- a/oap-cache/oap/src/main/scala/org/apache/spark/sql/internal/oap/OapConf.scala +++ b/oap-cache/oap/src/main/scala/org/apache/spark/sql/internal/oap/OapConf.scala @@ -426,6 +426,13 @@ object OapConf { .stringConf .createWithDefault("") + val OAP_EXTERNAL_CACHE_CLIENT_POOL_SIZE = + SqlConfAdapter.buildConf("spark.sql.oap.cache.external.client.pool.size") + .internal() + .doc("client pool for external cache") + .intConf + .createWithDefault(1) + val OAP_CACHE_GUARDIAN_FREE_THREAD_NUM = SqlConfAdapter.buildConf("spark.sql.oap.cache.guardian.free.thread.nums") .internal() diff --git a/oap-cache/oap/src/main/spark2.4.4/scala/org/apache/spark/SparkEnv.scala b/oap-cache/oap/src/main/spark2.4.4/scala/org/apache/spark/SparkEnv.scala new file mode 100644 index 000000000..d0217a3d2 --- /dev/null +++ b/oap-cache/oap/src/main/spark2.4.4/scala/org/apache/spark/SparkEnv.scala @@ -0,0 +1,453 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark + +import java.io.File +import java.net.Socket +import java.util.Locale + +import scala.collection.mutable +import scala.util.Properties + +import com.google.common.collect.MapMaker + +import org.apache.spark.annotation.DeveloperApi +import org.apache.spark.api.python.PythonWorkerFactory +import org.apache.spark.broadcast.BroadcastManager +import org.apache.spark.internal.Logging +import org.apache.spark.internal.config._ +import org.apache.spark.memory.{MemoryManager, StaticMemoryManager, UnifiedMemoryManager} +import org.apache.spark.metrics.MetricsSystem +import org.apache.spark.network.netty.NettyBlockTransferService +import org.apache.spark.rpc.{RpcEndpoint, RpcEndpointRef, RpcEnv} +import org.apache.spark.scheduler.{LiveListenerBus, OutputCommitCoordinator} +import org.apache.spark.scheduler.OutputCommitCoordinator.OutputCommitCoordinatorEndpoint +import org.apache.spark.security.CryptoStreamUtils +import org.apache.spark.serializer.{JavaSerializer, Serializer, SerializerManager} +import org.apache.spark.shuffle.ShuffleManager +import org.apache.spark.sql.oap.OapRuntime +import org.apache.spark.storage._ +import org.apache.spark.util.{RpcUtils, Utils} + +/** + * :: DeveloperApi :: + * Holds all the runtime environment objects for a running Spark instance (either master or worker), + * including the serializer, RpcEnv, block manager, map output tracker, etc. Currently + * Spark code finds the SparkEnv through a global variable, so all the threads can access the same + * SparkEnv. It can be accessed by SparkEnv.get (e.g. after creating a SparkContext). + * + * NOTE: This is not intended for external use. This is exposed for Shark and may be made private + * in a future release. + */ +@DeveloperApi +class SparkEnv ( + val executorId: String, + private[spark] val rpcEnv: RpcEnv, + val serializer: Serializer, + val closureSerializer: Serializer, + val serializerManager: SerializerManager, + val mapOutputTracker: MapOutputTracker, + val shuffleManager: ShuffleManager, + val broadcastManager: BroadcastManager, + val blockManager: BlockManager, + val securityManager: SecurityManager, + val metricsSystem: MetricsSystem, + val memoryManager: MemoryManager, + val outputCommitCoordinator: OutputCommitCoordinator, + val conf: SparkConf) extends Logging { + + private[spark] var isStopped = false + private val pythonWorkers = mutable.HashMap[(String, Map[String, String]), PythonWorkerFactory]() + + // A general, soft-reference map for metadata needed during HadoopRDD split computation + // (e.g., HadoopFileRDD uses this to cache JobConfs and InputFormats). + private[spark] val hadoopJobMetadata = new MapMaker().softValues().makeMap[String, Any]() + + private[spark] var driverTmpDir: Option[String] = None + + private[spark] def stop() { + + if (!isStopped) { + isStopped = true + pythonWorkers.values.foreach(_.stop()) + OapRuntime.stop() + mapOutputTracker.stop() + shuffleManager.stop() + broadcastManager.stop() + blockManager.stop() + blockManager.master.stop() + metricsSystem.stop() + outputCommitCoordinator.stop() + rpcEnv.shutdown() + rpcEnv.awaitTermination() + + // If we only stop sc, but the driver process still run as a services then we need to delete + // the tmp dir, if not, it will create too many tmp dirs. 
+ // We only need to delete the tmp dir create by driver + driverTmpDir match { + case Some(path) => + try { + Utils.deleteRecursively(new File(path)) + } catch { + case e: Exception => + logWarning(s"Exception while deleting Spark temp dir: $path", e) + } + case None => // We just need to delete tmp dir created by driver, so do nothing on executor + } + } + } + + private[spark] + def createPythonWorker(pythonExec: String, envVars: Map[String, String]): java.net.Socket = { + synchronized { + val key = (pythonExec, envVars) + pythonWorkers.getOrElseUpdate(key, new PythonWorkerFactory(pythonExec, envVars)).create() + } + } + + private[spark] + def destroyPythonWorker(pythonExec: String, envVars: Map[String, String], worker: Socket) { + synchronized { + val key = (pythonExec, envVars) + pythonWorkers.get(key).foreach(_.stopWorker(worker)) + } + } + + private[spark] + def releasePythonWorker(pythonExec: String, envVars: Map[String, String], worker: Socket) { + synchronized { + val key = (pythonExec, envVars) + pythonWorkers.get(key).foreach(_.releaseWorker(worker)) + } + } +} + +object SparkEnv extends Logging { + @volatile private var env: SparkEnv = _ + + private[spark] val driverSystemName = "sparkDriver" + private[spark] val executorSystemName = "sparkExecutor" + + def set(e: SparkEnv) { + env = e + } + + /** + * Returns the SparkEnv. + */ + def get: SparkEnv = { + env + } + + /** + * Create a SparkEnv for the driver. + */ + private[spark] def createDriverEnv( + conf: SparkConf, + isLocal: Boolean, + listenerBus: LiveListenerBus, + numCores: Int, + mockOutputCommitCoordinator: Option[OutputCommitCoordinator] = None): SparkEnv = { + assert(conf.contains(DRIVER_HOST_ADDRESS), + s"${DRIVER_HOST_ADDRESS.key} is not set on the driver!") + assert(conf.contains("spark.driver.port"), "spark.driver.port is not set on the driver!") + val bindAddress = conf.get(DRIVER_BIND_ADDRESS) + val advertiseAddress = conf.get(DRIVER_HOST_ADDRESS) + val port = conf.get("spark.driver.port").toInt + val ioEncryptionKey = if (conf.get(IO_ENCRYPTION_ENABLED)) { + Some(CryptoStreamUtils.createKey(conf)) + } else { + None + } + create( + conf, + SparkContext.DRIVER_IDENTIFIER, + bindAddress, + advertiseAddress, + Option(port), + isLocal, + numCores, + ioEncryptionKey, + listenerBus = listenerBus, + mockOutputCommitCoordinator = mockOutputCommitCoordinator + ) + } + + /** + * Create a SparkEnv for an executor. + * In coarse-grained mode, the executor provides an RpcEnv that is already instantiated. + */ + private[spark] def createExecutorEnv( + conf: SparkConf, + executorId: String, + hostname: String, + numCores: Int, + ioEncryptionKey: Option[Array[Byte]], + isLocal: Boolean): SparkEnv = { + val env = create( + conf, + executorId, + hostname, + hostname, + None, + isLocal, + numCores, + ioEncryptionKey + ) + SparkEnv.set(env) + env + } + + /** + * Helper method to create a SparkEnv for a driver or an executor. 
+ */ + private def create( + conf: SparkConf, + executorId: String, + bindAddress: String, + advertiseAddress: String, + port: Option[Int], + isLocal: Boolean, + numUsableCores: Int, + ioEncryptionKey: Option[Array[Byte]], + listenerBus: LiveListenerBus = null, + mockOutputCommitCoordinator: Option[OutputCommitCoordinator] = None): SparkEnv = { + + val isDriver = executorId == SparkContext.DRIVER_IDENTIFIER + + // Listener bus is only used on the driver + if (isDriver) { + assert(listenerBus != null, "Attempted to create driver SparkEnv with null listener bus!") + } + + val securityManager = new SecurityManager(conf, ioEncryptionKey) + if (isDriver) { + securityManager.initializeAuth() + } + + ioEncryptionKey.foreach { _ => + if (!securityManager.isEncryptionEnabled()) { + logWarning("I/O encryption enabled without RPC encryption: keys will be visible on the " + + "wire.") + } + } + + val systemName = if (isDriver) driverSystemName else executorSystemName + val rpcEnv = RpcEnv.create(systemName, bindAddress, advertiseAddress, port.getOrElse(-1), conf, + securityManager, numUsableCores, !isDriver) + + // Figure out which port RpcEnv actually bound to in case the original port is 0 or occupied. + if (isDriver) { + conf.set("spark.driver.port", rpcEnv.address.port.toString) + } + + // Create an instance of the class with the given name, possibly initializing it with our conf + def instantiateClass[T](className: String): T = { + val cls = Utils.classForName(className) + // Look for a constructor taking a SparkConf and a boolean isDriver, then one taking just + // SparkConf, then one taking no arguments + try { + cls.getConstructor(classOf[SparkConf], java.lang.Boolean.TYPE) + .newInstance(conf, new java.lang.Boolean(isDriver)) + .asInstanceOf[T] + } catch { + case _: NoSuchMethodException => + try { + cls.getConstructor(classOf[SparkConf]).newInstance(conf).asInstanceOf[T] + } catch { + case _: NoSuchMethodException => + cls.getConstructor().newInstance().asInstanceOf[T] + } + } + } + + // Create an instance of the class named by the given SparkConf property, or defaultClassName + // if the property is not set, possibly initializing it with our conf + def instantiateClassFromConf[T](propertyName: String, defaultClassName: String): T = { + instantiateClass[T](conf.get(propertyName, defaultClassName)) + } + + val serializer = instantiateClassFromConf[Serializer]( + "spark.serializer", "org.apache.spark.serializer.JavaSerializer") + logDebug(s"Using serializer: ${serializer.getClass}") + + val serializerManager = new SerializerManager(serializer, conf, ioEncryptionKey) + + val closureSerializer = new JavaSerializer(conf) + + def registerOrLookupEndpoint( + name: String, endpointCreator: => RpcEndpoint): + RpcEndpointRef = { + if (isDriver) { + logInfo("Registering " + name) + rpcEnv.setupEndpoint(name, endpointCreator) + } else { + RpcUtils.makeDriverRef(name, conf, rpcEnv) + } + } + + val broadcastManager = new BroadcastManager(isDriver, conf, securityManager) + + val mapOutputTracker = if (isDriver) { + new MapOutputTrackerMaster(conf, broadcastManager, isLocal) + } else { + new MapOutputTrackerWorker(conf) + } + + // Have to assign trackerEndpoint after initialization as MapOutputTrackerEndpoint + // requires the MapOutputTracker itself + mapOutputTracker.trackerEndpoint = registerOrLookupEndpoint(MapOutputTracker.ENDPOINT_NAME, + new MapOutputTrackerMasterEndpoint( + rpcEnv, mapOutputTracker.asInstanceOf[MapOutputTrackerMaster], conf)) + + // Let the user specify short names for shuffle 
managers + val shortShuffleMgrNames = Map( + "sort" -> classOf[org.apache.spark.shuffle.sort.SortShuffleManager].getName, + "tungsten-sort" -> classOf[org.apache.spark.shuffle.sort.SortShuffleManager].getName) + val shuffleMgrName = conf.get("spark.shuffle.manager", "sort") + val shuffleMgrClass = + shortShuffleMgrNames.getOrElse(shuffleMgrName.toLowerCase(Locale.ROOT), shuffleMgrName) + val shuffleManager = instantiateClass[ShuffleManager](shuffleMgrClass) + + val useLegacyMemoryManager = conf.getBoolean("spark.memory.useLegacyMode", false) + val memoryManager: MemoryManager = + if (useLegacyMemoryManager) { + new StaticMemoryManager(conf, numUsableCores) + } else { + UnifiedMemoryManager(conf, numUsableCores) + } + + val blockManagerPort = if (isDriver) { + conf.get(DRIVER_BLOCK_MANAGER_PORT) + } else { + conf.get(BLOCK_MANAGER_PORT) + } + + val blockTransferService = + new NettyBlockTransferService(conf, securityManager, bindAddress, advertiseAddress, + blockManagerPort, numUsableCores) + + val blockManagerMaster = new BlockManagerMaster(registerOrLookupEndpoint( + BlockManagerMaster.DRIVER_ENDPOINT_NAME, + new BlockManagerMasterEndpoint(rpcEnv, isLocal, conf, listenerBus)), + conf, isDriver) + + // NB: blockManager is not valid until initialize() is called later. + val blockManager = new BlockManager(executorId, rpcEnv, blockManagerMaster, + serializerManager, conf, memoryManager, mapOutputTracker, shuffleManager, + blockTransferService, securityManager, numUsableCores) + + val metricsSystem = if (isDriver) { + // Don't start metrics system right now for Driver. + // We need to wait for the task scheduler to give us an app ID. + // Then we can start the metrics system. + MetricsSystem.createMetricsSystem("driver", conf, securityManager) + } else { + // We need to set the executor ID before the MetricsSystem is created because sources and + // sinks specified in the metrics configuration file will want to incorporate this executor's + // ID into the metrics they report. + conf.set("spark.executor.id", executorId) + val ms = MetricsSystem.createMetricsSystem("executor", conf, securityManager) + ms.start() + ms + } + + val outputCommitCoordinator = mockOutputCommitCoordinator.getOrElse { + new OutputCommitCoordinator(conf, isDriver) + } + val outputCommitCoordinatorRef = registerOrLookupEndpoint("OutputCommitCoordinator", + new OutputCommitCoordinatorEndpoint(rpcEnv, outputCommitCoordinator)) + outputCommitCoordinator.coordinatorRef = Some(outputCommitCoordinatorRef) + + val envInstance = new SparkEnv( + executorId, + rpcEnv, + serializer, + closureSerializer, + serializerManager, + mapOutputTracker, + shuffleManager, + broadcastManager, + blockManager, + securityManager, + metricsSystem, + memoryManager, + outputCommitCoordinator, + conf) + + // Add a reference to tmp dir created by driver, we will delete this tmp dir when stop() is + // called, and we only need to do it for driver. Because driver may run as a service, and if we + // don't delete this tmp dir when sc is stopped, then will create too many tmp dirs. + if (isDriver) { + val sparkFilesDir = Utils.createTempDir(Utils.getLocalDir(conf), "userFiles").getAbsolutePath + envInstance.driverTmpDir = Some(sparkFilesDir) + } + + envInstance + } + + /** + * Return a map representation of jvm information, Spark properties, system properties, and + * class paths. Map keys define the category, and map values represent the corresponding + * attributes as a sequence of KV pairs. This is used mainly for SparkListenerEnvironmentUpdate. 
+ */ + private[spark] + def environmentDetails( + conf: SparkConf, + schedulingMode: String, + addedJars: Seq[String], + addedFiles: Seq[String]): Map[String, Seq[(String, String)]] = { + + import Properties._ + val jvmInformation = Seq( + ("Java Version", s"$javaVersion ($javaVendor)"), + ("Java Home", javaHome), + ("Scala Version", versionString) + ).sorted + + // Spark properties + // This includes the scheduling mode whether or not it is configured (used by SparkUI) + val schedulerMode = + if (!conf.contains("spark.scheduler.mode")) { + Seq(("spark.scheduler.mode", schedulingMode)) + } else { + Seq.empty[(String, String)] + } + val sparkProperties = (conf.getAll ++ schedulerMode).sorted + + // System properties that are not java classpaths + val systemProperties = Utils.getSystemProperties.toSeq + val otherProperties = systemProperties.filter { case (k, _) => + k != "java.class.path" && !k.startsWith("spark.") + }.sorted + + // Class paths including all added jars and files + val classPathEntries = javaClassPath + .split(File.pathSeparator) + .filterNot(_.isEmpty) + .map((_, "System Classpath")) + val addedJarsAndFiles = (addedJars ++ addedFiles).map((_, "Added By User")) + val classPaths = (addedJarsAndFiles ++ classPathEntries).sorted + + Map[String, Seq[(String, String)]]( + "JVM Information" -> jvmInformation, + "Spark Properties" -> sparkProperties, + "System Properties" -> otherProperties, + "Classpath Entries" -> classPaths) + } +} diff --git a/oap-cache/oap/src/test/assembly/test-jar-with-dependencies.xml b/oap-cache/oap/src/test/assembly/test-jar-with-dependencies.xml new file mode 100644 index 000000000..bcafe2888 --- /dev/null +++ b/oap-cache/oap/src/test/assembly/test-jar-with-dependencies.xml @@ -0,0 +1,19 @@ + + test-jar-with-dependencies + + jar + + false + + + ./ + true + + true + true + runtime + + + \ No newline at end of file diff --git a/oap-cache/oap/src/test/oap-perf-suite/conf/oap-benchmark-default.conf b/oap-cache/oap/src/test/oap-perf-suite/conf/oap-benchmark-default.conf new file mode 100644 index 000000000..4076f5e66 --- /dev/null +++ b/oap-cache/oap/src/test/oap-perf-suite/conf/oap-benchmark-default.conf @@ -0,0 +1,9 @@ +# Sample conf for future dev, and it is able to be accessed in running folder of the perf-suite. +# All options are used in data gen stage, and scale is also used to format the database name of +# each suite. + +oap.benchmark.hdfs.file.root.dir /dailytest +oap.benchmark.tpcds.data.scale 200 +oap.benchmark.tpcds.data.partition 80 +oap.benchmark.tpcds.data.format parquet + diff --git a/oap-cache/oap/src/test/oap-perf-suite/scala/org/apache/spark/sql/BenchmarkConfig.scala b/oap-cache/oap/src/test/oap-perf-suite/scala/org/apache/spark/sql/BenchmarkConfig.scala new file mode 100644 index 000000000..fb44c085a --- /dev/null +++ b/oap-cache/oap/src/test/oap-perf-suite/scala/org/apache/spark/sql/BenchmarkConfig.scala @@ -0,0 +1,406 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark.sql + +import org.apache.spark + +import scala.collection.mutable +import org.apache.spark.SparkConf +import org.apache.spark.sql.internal.oap.OapConf + + +// TODO: use SQLConf style i.e. (value, defaultValue) +class BenchmarkConfig { + // Benchmark config, include file format, index use or not, etc. + private val benchmarkConf: mutable.HashMap[String, String] = mutable.HashMap.empty + + // Spark conf, to initial spark session. + private val sparkConf: mutable.HashMap[String, String] = mutable.HashMap.empty + + def setBenchmarkConf(name: String, value: String): BenchmarkConfig = { + benchmarkConf.put(name, value) + this + } + + /** A meaningful name for this config + * like "oap + index" or "parquet w/o index" or "oap and oapStrategy enable" + */ + def setBenchmarkConfName(name: String): BenchmarkConfig = { + confName = Option(name) + this + } + + var confName: Option[String] = None + + def setSparkConf(name: String, value: String): BenchmarkConfig = { + sparkConf.put(name, value) + this + } + + /** + * Find a conf from all conf settings. + */ + def getConf(name: String): String = { + benchmarkConf.get(name).getOrElse( + sparkConf.get(name).getOrElse( + s"$name Not Exist!!!")) + } + + /** + * Get benchmark config + * @param name: name + * @return benchmark config setting. + */ + def getBenchmarkConf(name: String): String = benchmarkConf.getOrElse(name, "false") + + /** + * Get spark config + * @param name: name + * @return sql config setting. + */ + def getSparkConf(name: String): String = sparkConf.getOrElse(name, "false") + + /** + * Get all spark config + * @return all spark config settings. + */ + def allSparkOptions(): Map[String, String] = sparkConf.toMap[String, String] + + /** + * Make config settings as config name, used if none name set. + * @return + */ + def configString: String = { + if (sparkConf.isEmpty) { + val indexEnable = if (getBenchmarkConf(BenchmarkConfig.INDEX_ENABLE).toBoolean) { + "W/ Index" + } else { + "W/O Index" + } + + s"${getBenchmarkConf(BenchmarkConfig.FILE_FORMAT)} $indexEnable" + } else { + // oap !eis & statistics + getBenchmarkConf(BenchmarkConfig.FILE_FORMAT) + " " + sparkConf.toArray.map{ setting => + val flag = if (setting._2 == "true") { + "" + } else { + "!" + } + flag + setting._1.split('.')(4) + }.mkString(getBenchmarkConf(BenchmarkConfig.FILE_FORMAT) + " ", " & ", "") + } + } + + override def toString: String = { + confName match { + case Some(name) => name + case None => configString + } + } +} + +object BenchmarkConfig { + val INDEX_ENABLE = "oap.benchmark.config.index" + val FILE_FORMAT = "oap.benchmark.config.format" +} + +abstract class BenchmarkConfigSelector { + // TODO: choose conf + def allConfigurations: Seq[BenchmarkConfig] +} + +object BenchmarkConfigSelector { + // TODO: build config accordingly. 
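+  // Usage sketch (hypothetical values): seed the wildcard map before suites enumerate their
+  // configurations, then keep only the matching ones, e.g.
+  //   BenchmarkConfigSelector.build(Map(
+  //     BenchmarkConfig.FILE_FORMAT -> "parquet",
+  //     BenchmarkConfig.INDEX_ENABLE -> "true"))
+  //   val selected = suite.allConfigurations.filter(BenchmarkConfigSelector.isSelected)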
+ val wildcardConfiguration: mutable.HashMap[String, String] = mutable.HashMap.empty + + def build(options: Map[String, String]): Unit = { + wildcardConfiguration ++= options + } + + def isSelected(config: BenchmarkConfig): Boolean = { + if (wildcardConfiguration.nonEmpty) { + wildcardConfiguration.exists{conf => + config.getConf(conf._1) == conf._2 || + config.confName.equals(conf._2) + } + } else { + true + } + } +} + +trait ParquetOnlyConfigSet extends BenchmarkConfigSelector{ + // TODO: choose conf + def allConfigurations: Seq[BenchmarkConfig] = Seq( + new BenchmarkConfig() + .setBenchmarkConfName("parquet w/ index") + .setBenchmarkConf(BenchmarkConfig.FILE_FORMAT, "parquet") + .setBenchmarkConf(BenchmarkConfig.INDEX_ENABLE, "true"), + new BenchmarkConfig() + .setBenchmarkConfName("parquet w/o index") + .setBenchmarkConf(BenchmarkConfig.FILE_FORMAT, "parquet") + .setBenchmarkConf(BenchmarkConfig.INDEX_ENABLE, "false") + ) +} + +trait OrcOnlyConfigSet extends BenchmarkConfigSelector{ + def allConfigurations: Seq[BenchmarkConfig] = Seq( + new BenchmarkConfig() + .setBenchmarkConfName("Orc w/ index") + .setBenchmarkConf(BenchmarkConfig.FILE_FORMAT, "orc") + .setBenchmarkConf(BenchmarkConfig.INDEX_ENABLE, "true"), + new BenchmarkConfig() + .setBenchmarkConfName("Orc w/o index") + .setBenchmarkConf(BenchmarkConfig.FILE_FORMAT, "orc") + .setBenchmarkConf(BenchmarkConfig.INDEX_ENABLE, "false") + ) +} + +trait ParquetVsOrcConfigSet extends BenchmarkConfigSelector{ + // TODO: choose conf + def allConfigurations: Seq[BenchmarkConfig] = if( new SparkConf().get("spark.sql.oap.fiberCache.memory.manager")=="offheap"){ + Seq( + new BenchmarkConfig() + .setBenchmarkConfName("Orc w/ index") + .setBenchmarkConf(BenchmarkConfig.FILE_FORMAT, "orc") + .setBenchmarkConf(BenchmarkConfig.INDEX_ENABLE, "true"), + new BenchmarkConfig() + .setBenchmarkConfName("Orc w/ index oap cache enabled") + .setBenchmarkConf(BenchmarkConfig.FILE_FORMAT, "orc") + .setBenchmarkConf(BenchmarkConfig.INDEX_ENABLE, "true") + .setSparkConf("spark.sql.oap.orc.data.cache.enable", "true") + .setSparkConf("spark.sql.orc.copyBatchToSpark", "true"), + new BenchmarkConfig() + .setBenchmarkConfName("Orc w/ index oap binary cache enabled") + .setBenchmarkConf(BenchmarkConfig.FILE_FORMAT, "orc") + .setBenchmarkConf(BenchmarkConfig.INDEX_ENABLE, "true") + .setSparkConf("spark.sql.oap.orc.binary.cache.enable", "true"), + new BenchmarkConfig() + .setBenchmarkConfName("orc w/ index data cache separation same medium enabled") + .setBenchmarkConf(BenchmarkConfig.FILE_FORMAT, "orc") + .setBenchmarkConf(BenchmarkConfig.INDEX_ENABLE, "true") + .setSparkConf("spark.sql.oap.orc.data.cache.enable", "true") + .setSparkConf("spark.sql.orc.copyBatchToSpark", "true") + .setSparkConf("spark.sql.oap.index.data.cache.separation.enable", "true") + .setSparkConf("spark.oap.cache.strategy", "mix"), + new BenchmarkConfig() + .setBenchmarkConfName("orc w/ index binary data cache separation same medium enabled") + .setBenchmarkConf(BenchmarkConfig.FILE_FORMAT, "orc") + .setBenchmarkConf(BenchmarkConfig.INDEX_ENABLE, "true") + .setSparkConf("spark.sql.oap.orc.binary.cache.enable", "true") + .setSparkConf("spark.sql.oap.index.data.cache.separation.enable", "true") + .setSparkConf("spark.oap.cache.strategy", "mix"), + new BenchmarkConfig() + .setBenchmarkConfName("Orc w/o index") + .setBenchmarkConf(BenchmarkConfig.FILE_FORMAT, "orc") + .setBenchmarkConf(BenchmarkConfig.INDEX_ENABLE, "false"), + new BenchmarkConfig() + .setBenchmarkConfName("Orc w/o index oap cache 
enabled") + .setBenchmarkConf(BenchmarkConfig.FILE_FORMAT, "orc") + .setBenchmarkConf(BenchmarkConfig.INDEX_ENABLE, "false") + .setSparkConf("spark.sql.oap.orc.data.cache.enable", "true") + .setSparkConf("spark.sql.orc.copyBatchToSpark", "true"), + new BenchmarkConfig() + .setBenchmarkConfName("Orc w/o index oap binary cache enabled") + .setBenchmarkConf(BenchmarkConfig.FILE_FORMAT, "orc") + .setBenchmarkConf(BenchmarkConfig.INDEX_ENABLE, "false") + .setSparkConf("spark.sql.oap.orc.binary.cache.enable", "true"), + new BenchmarkConfig() + .setBenchmarkConfName("parquet w/ index") + .setBenchmarkConf(BenchmarkConfig.FILE_FORMAT, "parquet") + .setBenchmarkConf(BenchmarkConfig.INDEX_ENABLE, "true"), + new BenchmarkConfig() + .setBenchmarkConfName("parquet w/ index oap cache enabled") + .setBenchmarkConf(BenchmarkConfig.FILE_FORMAT, "parquet") + .setBenchmarkConf(BenchmarkConfig.INDEX_ENABLE, "true") + .setSparkConf("spark.sql.oap.parquet.data.cache.enable", "true"), + new BenchmarkConfig() + .setBenchmarkConfName("parquet w/ index oap binary cache enabled") + .setBenchmarkConf(BenchmarkConfig.FILE_FORMAT, "parquet") + .setBenchmarkConf(BenchmarkConfig.INDEX_ENABLE, "true") + .setSparkConf("spark.sql.oap.parquet.binary.cache.enable", "true"), + new BenchmarkConfig() + .setBenchmarkConfName("parquet w/o index") + .setBenchmarkConf(BenchmarkConfig.FILE_FORMAT, "parquet") + .setBenchmarkConf(BenchmarkConfig.INDEX_ENABLE, "false"), + new BenchmarkConfig() + .setBenchmarkConfName("parquet w/o index oap cache enabled") + .setBenchmarkConf(BenchmarkConfig.FILE_FORMAT, "parquet") + .setBenchmarkConf(BenchmarkConfig.INDEX_ENABLE, "false") + .setSparkConf("spark.sql.oap.parquet.data.cache.enable", "true"), + new BenchmarkConfig() + .setBenchmarkConfName("parquet w/o index oap binary cache enabled") + .setBenchmarkConf(BenchmarkConfig.FILE_FORMAT, "parquet") + .setBenchmarkConf(BenchmarkConfig.INDEX_ENABLE, "false") + .setSparkConf("spark.sql.oap.parquet.binary.cache.enable", "true"), + new BenchmarkConfig() + .setBenchmarkConfName("parquet w/ index data cache separation same medium enabled") + .setBenchmarkConf(BenchmarkConfig.FILE_FORMAT, "parquet") + .setBenchmarkConf(BenchmarkConfig.INDEX_ENABLE, "true") + .setSparkConf("spark.sql.oap.parquet.data.cache.enable", "true") + .setSparkConf("spark.sql.oap.index.data.cache.separation.enable", "true") + .setSparkConf("spark.oap.cache.strategy", "mix"), + new BenchmarkConfig() + .setBenchmarkConfName("parquet w/ index binary data cache separation same medium enabled") + .setBenchmarkConf(BenchmarkConfig.FILE_FORMAT, "parquet") + .setBenchmarkConf(BenchmarkConfig.INDEX_ENABLE, "true") + .setSparkConf("spark.sql.oap.parquet.binary.cache.enable", "true") + .setSparkConf("spark.sql.oap.index.data.cache.separation.enable", "true") + .setSparkConf("spark.oap.cache.strategy", "mix") + ) + }else{ + Seq( + new BenchmarkConfig() + .setBenchmarkConfName("Orc w/ index") + .setBenchmarkConf(BenchmarkConfig.FILE_FORMAT, "orc") + .setBenchmarkConf(BenchmarkConfig.INDEX_ENABLE, "true"), + new BenchmarkConfig() + .setBenchmarkConfName("Orc w/ index oap cache enabled") + .setBenchmarkConf(BenchmarkConfig.FILE_FORMAT, "orc") + .setBenchmarkConf(BenchmarkConfig.INDEX_ENABLE, "true") + .setSparkConf("spark.sql.oap.orc.data.cache.enable", "true") + .setSparkConf("spark.sql.orc.copyBatchToSpark", "true"), + new BenchmarkConfig() + .setBenchmarkConfName("Orc w/ index oap binary cache enabled") + .setBenchmarkConf(BenchmarkConfig.FILE_FORMAT, "orc") + 
.setBenchmarkConf(BenchmarkConfig.INDEX_ENABLE, "true") + .setSparkConf("spark.sql.oap.orc.binary.cache.enable", "true"), + new BenchmarkConfig() + .setBenchmarkConfName("orc w/ index data cache separation same medium enabled") + .setBenchmarkConf(BenchmarkConfig.FILE_FORMAT, "orc") + .setBenchmarkConf(BenchmarkConfig.INDEX_ENABLE, "true") + .setSparkConf("spark.sql.oap.orc.data.cache.enable", "true") + .setSparkConf("spark.sql.orc.copyBatchToSpark", "true") + .setSparkConf("spark.sql.oap.index.data.cache.separation.enable", "true") + .setSparkConf("spark.oap.cache.strategy", "mix"), + new BenchmarkConfig() + .setBenchmarkConfName("orc w/ index binary data cache separation same medium enabled") + .setBenchmarkConf(BenchmarkConfig.FILE_FORMAT, "orc") + .setBenchmarkConf(BenchmarkConfig.INDEX_ENABLE, "true") + .setSparkConf("spark.sql.oap.orc.binary.cache.enable", "true") + .setSparkConf("spark.sql.oap.index.data.cache.separation.enable", "true") + .setSparkConf("spark.oap.cache.strategy", "mix"), + new BenchmarkConfig() + .setBenchmarkConfName("Orc w/ index data cache separation different medium enabled") + .setBenchmarkConf(BenchmarkConfig.FILE_FORMAT, "orc") + .setBenchmarkConf(BenchmarkConfig.INDEX_ENABLE, "true") + .setSparkConf("spark.sql.oap.orc.data.cache.enable", "true") + .setSparkConf("spark.sql.orc.copyBatchToSpark", "true") + .setSparkConf("spark.sql.oap.index.data.cache.separation.enable", "true") + .setSparkConf("spark.sql.oap.fiberCache.memory.manager", "mix") + .setSparkConf("spark.oap.cache.strategy", "mix"), + new BenchmarkConfig() + .setBenchmarkConfName("Orc w/ index data binary cache separation different medium enabled") + .setBenchmarkConf(BenchmarkConfig.FILE_FORMAT, "orc") + .setBenchmarkConf(BenchmarkConfig.INDEX_ENABLE, "true") + .setSparkConf("spark.sql.oap.orc.binary.cache.enable", "true") + .setSparkConf("spark.sql.oap.index.data.cache.separation.enable", "true") + .setSparkConf("spark.sql.oap.fiberCache.memory.manager", "mix") + .setSparkConf("spark.oap.cache.strategy", "mix"), + new BenchmarkConfig() + .setBenchmarkConfName("Orc w/o index") + .setBenchmarkConf(BenchmarkConfig.FILE_FORMAT, "orc") + .setBenchmarkConf(BenchmarkConfig.INDEX_ENABLE, "false"), + new BenchmarkConfig() + .setBenchmarkConfName("Orc w/o index oap cache enabled") + .setBenchmarkConf(BenchmarkConfig.FILE_FORMAT, "orc") + .setBenchmarkConf(BenchmarkConfig.INDEX_ENABLE, "false") + .setSparkConf("spark.sql.oap.orc.data.cache.enable", "true") + .setSparkConf("spark.sql.orc.copyBatchToSpark", "true"), + new BenchmarkConfig() + .setBenchmarkConfName("Orc w/o index oap binary cache enabled") + .setBenchmarkConf(BenchmarkConfig.FILE_FORMAT, "orc") + .setBenchmarkConf(BenchmarkConfig.INDEX_ENABLE, "false") + .setSparkConf("spark.sql.oap.orc.binary.cache.enable", "true"), + new BenchmarkConfig() + .setBenchmarkConfName("parquet w/ index") + .setBenchmarkConf(BenchmarkConfig.FILE_FORMAT, "parquet") + .setBenchmarkConf(BenchmarkConfig.INDEX_ENABLE, "true"), + new BenchmarkConfig() + .setBenchmarkConfName("parquet w/ index oap cache enabled") + .setBenchmarkConf(BenchmarkConfig.FILE_FORMAT, "parquet") + .setBenchmarkConf(BenchmarkConfig.INDEX_ENABLE, "true") + .setSparkConf("spark.sql.oap.parquet.data.cache.enable", "true"), + new BenchmarkConfig() + .setBenchmarkConfName("parquet w/ index oap binary cache enabled") + .setBenchmarkConf(BenchmarkConfig.FILE_FORMAT, "parquet") + .setBenchmarkConf(BenchmarkConfig.INDEX_ENABLE, "true") + .setSparkConf("spark.sql.oap.parquet.binary.cache.enable", "true"), 
+ new BenchmarkConfig() + .setBenchmarkConfName("parquet w/o index") + .setBenchmarkConf(BenchmarkConfig.FILE_FORMAT, "parquet") + .setBenchmarkConf(BenchmarkConfig.INDEX_ENABLE, "false"), + new BenchmarkConfig() + .setBenchmarkConfName("parquet w/o index oap cache enabled") + .setBenchmarkConf(BenchmarkConfig.FILE_FORMAT, "parquet") + .setBenchmarkConf(BenchmarkConfig.INDEX_ENABLE, "false") + .setSparkConf("spark.sql.oap.parquet.data.cache.enable", "true"), + new BenchmarkConfig() + .setBenchmarkConfName("parquet w/o index oap binary cache enabled") + .setBenchmarkConf(BenchmarkConfig.FILE_FORMAT, "parquet") + .setBenchmarkConf(BenchmarkConfig.INDEX_ENABLE, "false") + .setSparkConf("spark.sql.oap.parquet.binary.cache.enable", "true"), + new BenchmarkConfig() + .setBenchmarkConfName("parquet w/ index data cache separation same medium enabled") + .setBenchmarkConf(BenchmarkConfig.FILE_FORMAT, "parquet") + .setBenchmarkConf(BenchmarkConfig.INDEX_ENABLE, "true") + .setSparkConf("spark.sql.oap.parquet.data.cache.enable", "true") + .setSparkConf("spark.sql.oap.index.data.cache.separation.enable", "true") + .setSparkConf("spark.oap.cache.strategy", "mix"), + new BenchmarkConfig() + .setBenchmarkConfName("parquet w/ index binary data cache separation same medium enabled") + .setBenchmarkConf(BenchmarkConfig.FILE_FORMAT, "parquet") + .setBenchmarkConf(BenchmarkConfig.INDEX_ENABLE, "true") + .setSparkConf("spark.sql.oap.parquet.binary.cache.enable", "true") + .setSparkConf("spark.sql.oap.index.data.cache.separation.enable", "true") + .setSparkConf("spark.oap.cache.strategy", "mix"), + new BenchmarkConfig() + .setBenchmarkConfName("parquet w/ index data cache separation different medium enabled") + .setBenchmarkConf(BenchmarkConfig.FILE_FORMAT, "parquet") + .setBenchmarkConf(BenchmarkConfig.INDEX_ENABLE, "true") + .setSparkConf("spark.sql.oap.parquet.data.cache.enable", "true") + .setSparkConf("spark.sql.oap.index.data.cache.separation.enable", "true") + .setSparkConf("spark.sql.oap.fiberCache.memory.manager", "mix") + .setSparkConf("spark.oap.cache.strategy", "mix"), + new BenchmarkConfig() + .setBenchmarkConfName("parquet w/ index binary cache separation different medium enabled") + .setBenchmarkConf(BenchmarkConfig.FILE_FORMAT, "parquet") + .setBenchmarkConf(BenchmarkConfig.INDEX_ENABLE, "true") + .setSparkConf("spark.sql.oap.parquet.binary.cache.enable", "true") + .setSparkConf("spark.sql.oap.index.data.cache.separation.enable", "true") + .setSparkConf("spark.sql.oap.fiberCache.memory.manager", "mix") + .setSparkConf("spark.oap.cache.strategy", "mix") + ) + } +} + + +trait LocalClusterConfigSet extends BenchmarkConfigSelector { + // TODO: choose conf + def allConfigurations: Seq[BenchmarkConfig] = Seq( + new BenchmarkConfig() + .setBenchmarkConfName("local cluster 100m offheap") + .setBenchmarkConf(BenchmarkConfig.FILE_FORMAT, "oap") + .setBenchmarkConf(BenchmarkConfig.INDEX_ENABLE, "true") + .setSparkConf("spark.memory.offHeap.enabled", "true") + .setSparkConf("spark.memory.offHeap.size", "100m") + + ) +} + diff --git a/oap-cache/oap/src/test/oap-perf-suite/scala/org/apache/spark/sql/OapBenchmarkDataBuilder.scala b/oap-cache/oap/src/test/oap-perf-suite/scala/org/apache/spark/sql/OapBenchmarkDataBuilder.scala new file mode 100755 index 000000000..ab2ce7c0e --- /dev/null +++ b/oap-cache/oap/src/test/oap-perf-suite/scala/org/apache/spark/sql/OapBenchmarkDataBuilder.scala @@ -0,0 +1,191 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark.sql + +import java.io.FileNotFoundException + +import scala.collection.mutable + +// import com.databricks.spark.sql.perf.tpcds.Tables +import org.apache.hadoop.conf.Configuration +import org.apache.hadoop.fs.{FileSystem, FileUtil, Path} + +import org.apache.spark.internal.Logging +import org.apache.spark.sql.functions._ +import org.apache.spark.util.Utils + + + +object OapBenchmarkDataBuilder extends OapPerfSuiteContext with Logging { + + private val defaultProperties = Map( + "oap.benchmark.hdfs.file.root.dir" -> "/dailytest", + "oap.benchmark.tpcds.data.scale" -> "200", + "oap.benchmark.tpcds.data.partition" -> "80", + "oap.benchmark.tpcds.data.format" -> "parquet" + ) + + def getDatabase(format: String) : String = { + val dataScale = properties.get("oap.benchmark.tpcds.data.scale").get.toInt + val baseName = format match { + case "parquet" => s"parquet$dataScale" + case "orc" => s"orc$dataScale" + case _ => "default" + } + + baseName + } + + def formatTableLocation(rootDir: String, tableFormat: String): String = { + s"${rootDir}/${getDatabase(tableFormat)}/" + } + + private val properties = { + try { + new mutable.HashMap[String, String]() ++= + Utils.getPropertiesFromFile("./src/test/oap-perf-suite/conf/oap-benchmark-default.conf") + } catch { + case e: IllegalArgumentException => { + logWarning(e.getMessage + ". 
Use default setting!") + defaultProperties + } + } + + } + + override def beforeAll(conf: Map[String, String] = Map.empty): Unit = { + super.beforeAll(conf) + } + +// def generateTables(): Unit = { +// val versionNum = properties.get("oap.benchmark.support.oap.version").get +// val codec = properties.get("oap.benchmark.compression.codec").get +// val scale = properties.get("oap.benchmark.tpcds.data.scale").get.toInt +// val partitions = properties.get("oap.benchmark.tpcds.data.partition").get.toInt +// val hdfsRootDir = properties.get("oap.benchmark.hdfs.file.root.dir").get +// val tpcdsToolPath = properties.get("oap.benchmark.tpcds.tool.dir").get +// val dataFormats = properties.get("oap.benchmark.tpcds.data.format").get.split(",", 0) +// +// dataFormats.foreach{ format => +// sqlContext.setConf(s"spark.sql.$format.compression.codec", codec) +// val loc = formatTableLocation(hdfsRootDir, versionNum, format) +// val tables = new Tables(sqlContext, tpcdsToolPath, scale) +// tables.genData( +// loc, format, true, false, true, false, false, "store_sales", partitions) +// } +// } +// +// def generateDatabases() { +// // TODO: get from OapFileFormatConfigSet +// val dataFormats = properties.get("oap.benchmark.tpcds.data.format").get.split(",", 0) +// dataFormats.foreach { format => +// spark.sql(s"create database if not exists ${getDatabase(format)}") +// } +// +// def genData(dataFormat: String) = { +// val versionNum = properties.get("oap.benchmark.support.oap.version").get +// val hdfsRootDir = properties.get("oap.benchmark.hdfs.file.root.dir").get +// val dataLocation = formatTableLocation(hdfsRootDir, versionNum, dataFormat) +// +// spark.sql(s"use ${getDatabase(dataFormat)}") +// spark.sql("drop table if exists store_sales") +// spark.sql("drop table if exists store_sales_dup") +// +// /** +// * To compare performance between B-Tree and Bitmap index, we generate duplicate +// * tables of store_sales here. Besides, store_sales_dup table can be used in testing +// * OAP strategies. +// */ +// val df = spark.read.format(dataFormat).load(dataLocation + "store_sales") +// val divRatio = df.select("ss_item_sk").orderBy(desc("ss_item_sk")).limit(1). 
+// collect()(0)(0).asInstanceOf[Int] / 1000 +// val divideUdf = udf((s: Int) => s / divRatio) +// df.withColumn("ss_item_sk1", divideUdf(col("ss_item_sk"))).write.format(dataFormat) +// .mode(SaveMode.Overwrite).save(dataLocation + "store_sales1") +// +// val conf = new Configuration() +// val hadoopFs = FileSystem.get(conf) +// hadoopFs.delete(new Path(dataLocation + "store_sales"), true) +// +// // Notice here delete source flag should firstly be set to false +// FileUtil.copy(hadoopFs, new Path(dataLocation + "store_sales1"), +// hadoopFs, new Path(dataLocation + "store_sales"), false, conf) +// FileUtil.copy(hadoopFs, new Path(dataLocation + "store_sales1"), +// hadoopFs, new Path(dataLocation + "store_sales_dup"), true, conf) +// +// sqlContext.createExternalTable("store_sales", dataLocation + "store_sales", dataFormat) +// sqlContext.createExternalTable("store_sales_dup", dataLocation + "store_sales_dup" +// , dataFormat) +// logWarning(s"File size of original table store_sales in $dataFormats format: " + +// TestUtil.calculateFileSize("store_sales", dataLocation, dataFormat) +// ) +// logWarning("Records of table store_sales: " + +// spark.read.format(dataFormat).load(dataLocation + "store_sales").count() +// ) +// } +// +// dataFormats.foreach(genData) +// } +// +// def buildAllIndex() { +// def buildBtreeIndex(tablePath: String, table: String, attr: String): Unit = { +// try { +// spark.sql(s"DROP OINDEX ${table}_${attr}_index ON $table") +// } catch { +// case _: Throwable => logWarning("Index doesn't exist, so don't need to drop here!") +// } finally { +// TestUtil.time( +// spark.sql( +// s"CREATE OINDEX IF NOT EXISTS ${table}_${attr}_index ON $table ($attr) USING BTREE" +// ), +// s"Create B-Tree index on ${table}(${attr}) cost " +// ) +// logWarning(s"The size of B-Tree index on ${table}(${attr}) cost:" + +// TestUtil.calculateIndexSize(table, tablePath, attr)) +// } +// } +// +// def buildBitmapIndex(tablePath: String, table: String, attr: String): Unit = { +// try { +// spark.sql(s"DROP OINDEX ${table}_${attr}_index ON $table") +// } catch { +// case _: Throwable => logWarning("Index doesn't exist, so don't need to drop here!") +// } finally { +// TestUtil.time( +// spark.sql( +// s"CREATE OINDEX IF NOT EXISTS ${table}_${attr}_index ON $table ($attr) USING BITMAP" +// ), +// s"Create Bitmap index on ${table}(${attr}) cost" +// ) +// logWarning(s"The size of Bitmap index on ${table}(${attr}) cost:" + +// TestUtil.calculateIndexSize(table, tablePath, attr)) +// } +// } +// +// val versionNum = properties.get("oap.benchmark.support.oap.version").get +// val hdfsRootDir = properties.get("oap.benchmark.hdfs.file.root.dir").get +// val dataFormats = properties.get("oap.benchmark.tpcds.data.format").get.split(",", 0) +// +// dataFormats.foreach { dataFormat => { +// spark.sql(s"use ${getDatabase(dataFormat)}") +// val tableLocation: String = formatTableLocation(hdfsRootDir, versionNum, dataFormat) +// buildBtreeIndex(tableLocation, "store_sales", "ss_customer_sk") +// buildBitmapIndex(tableLocation, "store_sales", "ss_item_sk1") +// } +// } +// } +} diff --git a/oap-cache/oap/src/test/oap-perf-suite/scala/org/apache/spark/sql/OapPerfSuite.scala b/oap-cache/oap/src/test/oap-perf-suite/scala/org/apache/spark/sql/OapPerfSuite.scala new file mode 100755 index 000000000..de6cb4004 --- /dev/null +++ b/oap-cache/oap/src/test/oap-perf-suite/scala/org/apache/spark/sql/OapPerfSuite.scala @@ -0,0 +1,167 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * 
contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Entry Point + */ +package org.apache.spark.sql + +import org.reflections.Reflections +import scala.collection.mutable +import scala.reflect.runtime.universe + +import org.apache.spark.internal.Logging +import org.apache.spark.sql.suites.LocalSparkMasterTestSuite + + + + + +object OapPerfSuite extends Logging { + + // register all suite + { + val reflections = new Reflections("org.apache.spark.sql") + val allSubSuiteTypes = reflections.getSubTypesOf(classOf[OapTestSuite]) + allSubSuiteTypes.toArray.foreach { suiteType => + val runtimeMirror = universe.runtimeMirror(getClass.getClassLoader) + val module = runtimeMirror.staticModule(suiteType.asInstanceOf[Class[OapTestSuite]].getName) + val obj = runtimeMirror.reflectModule(module) + val suite = obj.instance.asInstanceOf[OapTestSuite] + + // skip bootstrapping tests. + if (!suite.isBootStrapping) { + BenchmarkSuiteSelector.registerSuite(suite) + } + } + } + + private val usage = + """ + | /** + | * OapBenchmark + | * -c Config (oap/parquet, index/non-index, etc.): + | * -p Profile (Full, BigData, SmallData, etc.): + | * -s SuiteName (All, etc.): + | * -t TestName (All, etc.): + | * -d Datagen: build data for all test. + | * -r Repeat(3): + | * -bootstrapping: self test without cluster support. + | */ + """.stripMargin + + /** + * OapBenchmark + * -c Config (oap/parquet, index/non-index, etc.): + * -p Profile (Full, BigData, SmallData, etc.): + * -s SuiteName (All, etc.): + * -t TestName (All, etc.): + * -d Datagen: gen data for test. + * -r Repeat(3): + * -b bootstrapping: self test without cluster support. 
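+   * Example invocation (hypothetical suite and option values; syntax follows the parser in main below):
+   *   OapPerfSuite -s BtreeIndexSuite -c oap.benchmark.config.format=parquet;oap.benchmark.config.index=true -r 5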
+ * TODO: -Dkey.conf=value + */ + def main(args: Array[String]): Unit = { + // TODO: use scala getOpts + if (args.isEmpty) sys.error(usage) + + var i = 0 + var repeat = 3 + while (i < args.length){ + args(i) match { + case "-suite" | "-s" => { + assert(args.length > i + 1) + BenchmarkSuiteSelector.build(args(i + 1)) + i += 2 + } + case "-config" | "-c" => { + // TODO: regex check: -c a=b;c=d;e=f + assert(args.length > i + 1) + val options: mutable.HashMap[String, String] = mutable.HashMap.empty + args(i + 1).split(';').map{_.split('=')}.foreach{ kv => + options ++= Map(kv(0)->kv(1)) + } + BenchmarkConfigSelector.build(options.toMap) + i += 2 + } + case "-test" | "-t" => { + assert(args.length > i + 1) + BenchmarkTestSelector.build(args(i + 1)) + i += 2 + } + case "-repeat" | "-r" => { + assert(args.length > i + 1) + repeat = args(i + 1).toInt + i += 2 + } +// case "-datagen" | "-d" => { +// OapBenchmarkDataBuilder.beforeAll() +// OapBenchmarkDataBuilder.generateTables() +// OapBenchmarkDataBuilder.generateDatabases() +// OapBenchmarkDataBuilder.buildAllIndex() +// OapBenchmarkDataBuilder.afterAll() +// // if run with -d only +// if(i == 0 && args.length == 1){ +// sys.exit() +// } else { +// i += 1 +// } +// } + case "-bootstrapping" => { + // self test. + assert(args.length == i + 1, "bootstrapping works alone.") + runSuite(LocalSparkMasterTestSuite, 3) + i += 1 + sys.exit(1) + } + case _ => sys.error(usage) + } + } + + BenchmarkSuiteSelector.selectedSuites().foreach{suite => runSuite(suite, repeat)} + } + + /** + * Run a suite + * @param suite: OapTestSuite knows how to run itself and + * give a report. + */ + def runSuite(suite: OapTestSuite, repeat: Int = 3): Unit = { + suite.allConfigurations + .filter(BenchmarkConfigSelector.isSelected(_)) + .foreach{ conf => + suite.runWith(conf){ + logWarning(s"running $suite with conf($conf).") + if (BenchmarkTestSelector.selectedTests().nonEmpty) { + BenchmarkTestSelector.selectedTests().foreach{ + suite.run(_, repeat) + } + } else { + suite.runAll(repeat) + } + } + } + + val res = suite.resultMap.toSeq + if (res.nonEmpty) { + // Object's Name is XXXXX$, so remove this $ + // TODO: check if $ exists. + println("#"+ suite.getClass.getCanonicalName.dropRight(1)) + TestUtil.formatResults(res) + } + } +} diff --git a/oap-cache/oap/src/test/oap-perf-suite/scala/org/apache/spark/sql/OapPerfSuiteContext.scala b/oap-cache/oap/src/test/oap-perf-suite/scala/org/apache/spark/sql/OapPerfSuiteContext.scala new file mode 100755 index 000000000..ddf0d334b --- /dev/null +++ b/oap-cache/oap/src/test/oap-perf-suite/scala/org/apache/spark/sql/OapPerfSuiteContext.scala @@ -0,0 +1,97 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Benchmark Session. 
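+ * A shared SparkSession is created in beforeAll() (a local[2] context when bootstrapping,
+ * otherwise a Hive-enabled session configured from the supplied conf map) and torn down in
+ * afterAll(), which also removes the Derby lock files and drops the OS page cache.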
+ */ +package org.apache.spark.sql + +import sys.process._ +import org.apache.spark._ + +trait OapPerfSuiteContext { + + /** + * The [[SparkSession]] to use for all tests in this suite. + * + * By default, the underlying [[org.apache.spark.SparkContext]] will be run in local + * mode with the default test configurations. + */ + private var _spark: SparkSession = null + + /** + * The [[SparkSession]] to use for all tests in this suite. + */ + protected implicit def spark: SparkSession = _spark + + /** + * The [[SparkSession]] to use for all tests in this suite. + */ + protected implicit def sqlContext: SQLContext = _spark.sqlContext + + def isBootStrapping = false + protected def createSparkSession(conf: Map[String, String] = Map.empty): SparkSession = { + if (isBootStrapping) { + val sparkConf = new SparkConf().set("spark.sql.testkey", "true") + conf.foreach(option => sparkConf.set(option._1, option._2)) + + new SparkSession( + // TODO: support s"local-cluster[2, 1, ${4*1024*1024}]", + new SparkContext( + "local[2]", + "test-sql-context", + sparkConf + ) + ) + } else { + val builder = SparkSession.builder().appName(getAppName) + conf.foreach(option => builder.config(option._1, option._2)) + builder.enableHiveSupport().getOrCreate() + } + } + + protected def getAppName: String = "defaultTest" + + /** + * Initialize the [[SparkSession]]. + */ + def beforeAll(conf: Map[String, String] = Map.empty): Unit = { + if (_spark == null) { + _spark = createSparkSession(conf) + SparkSession.setActiveSession(_spark) + } + } + + /** + * Stop the underlying [[org.apache.spark.SparkContext]], if any. + */ + def afterAll(): Unit = { + if (_spark != null) { + SparkSession.clearActiveSession() + _spark.stop() + _spark = null + assert(("rm -f ./metastore_db/db.lck" !) == 0) + assert(("rm -f ./metastore_db/dbex.lck" !) == 0) + assert(("sync" !) == 0) + + if (!isBootStrapping) { + val dropCacheResult = Seq("bash", "-c", "echo 3 > /proc/sys/vm/drop_caches").! + assert(dropCacheResult == 0) + } + } + } +} diff --git a/oap-cache/oap/src/test/oap-perf-suite/scala/org/apache/spark/sql/OapTestSuite.scala b/oap-cache/oap/src/test/oap-perf-suite/scala/org/apache/spark/sql/OapTestSuite.scala new file mode 100755 index 000000000..1b862660e --- /dev/null +++ b/oap-cache/oap/src/test/oap-perf-suite/scala/org/apache/spark/sql/OapTestSuite.scala @@ -0,0 +1,157 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Base interface for test suite. 
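+ * Typical flow: OapPerfSuite invokes runWith(config) { ... }, which activates the config,
+ * starts the session, calls prepare(), runs the selected tests repCount times each (dropping
+ * OS caches before every query), and accumulates per-test timings in resultMap.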
+ */ +package org.apache.spark.sql + +import org.apache.spark.internal.Logging + +import scala.collection.mutable +import scala.collection.mutable.ArrayBuffer + +import sys.process._ + +abstract class OapTestSuite extends BenchmarkConfigSelector with OapPerfSuiteContext with Logging { + + // Class information + case class OapBenchmarkTest(_name: String, _sentence: String, _profile: String = "Benchmark") { + def name = _name + def sql = _sentence + def profile = _profile + } + + def activeConf: BenchmarkConfig = { + if (_activeConf.isEmpty) { + assert(false, "No active configuration found!") + } + _activeConf.get + } + + def allTests(): Seq[OapBenchmarkTest] = testSet + + def runAll(repCount: Int): Unit = { + testSet.foreach{ + run(_, repCount) + } + } + + def run(name: String, repCount: Int): Unit = { + testSet.filter(_.name == name).foreach{ + run(_, repCount) + } + } + + def run(test: OapBenchmarkTest, repCount: Int): Unit = { + logWarning(s"running ${test.name} ($repCount times) ...") + val result = (1 to repCount).map{ _ => + dropCache() + TestUtil.queryTime(spark.sql(test.sql).foreach{ _ => }) + }.toArray + + val prev: Seq[(String, Array[Int])] = _resultMap.getOrElse(test.name, Nil) + val curr = prev :+ (activeConf.toString, result) + _resultMap.put(test.name, curr) + } + + private var _activeConf: Option[BenchmarkConfig] = None + def runWith(conf: BenchmarkConfig)(body: => Unit): Unit = { + _activeConf = Some(conf) + beforeAll(conf.allSparkOptions()) + if (prepare()){ + body + } else { + assert(false, s"$this checkCondition Failed!") + } + afterAll() + _activeConf = None + } + + /** + * Prepare running env, include data check, various settings + * of current(active) benchmark config. + * + * @return true if success + */ + def prepare(): Boolean + + /** + * Final table may look like: + * +--------+--------+--------+--------+--------+ + * | | | T1 | TN |Avg/Med | + * +--------+--------+--------+--------+--------+ + * | |config1 | | | | + * + Test1 +--------+--------+--------+--------+ + * | |config2 | | | | + * +--------+--------+--------+--------+--------+ + * | |config1 | | | | + * + Test2 +--------+--------+--------+--------+ + * | |config2 | | | | + * +--------+--------+--------+--------+--------+ + * + * resultMap: (Test1 -> Seq( (config1, (1, 2, 3, ...)), + * (config2, (1, 2, 3, ...))), + * Test2 -> Seq( (config1, (1, 2, 3, ...)), + * (config2, (1, 2, 3, ...))), + * ...) + */ + private val _resultMap: mutable.LinkedHashMap[String, Seq[(String, Array[Int])]] = + new mutable.LinkedHashMap[String, Seq[(String, Array[Int])]] + + def resultMap = _resultMap + + protected def testSet: Seq[OapBenchmarkTest] + protected def dropCache(): Unit = { + val nodes = spark.sparkContext.getExecutorMemoryStatus.map(_._1.split(":")(0)) + nodes.foreach { node => + val dropCacheResult = Seq("bash", "-c", s"""ssh $node "echo 3 > /proc/sys/vm/drop_caches"""").! 
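+      // Assumes passwordless SSH to every executor host and permission to write
+      // /proc/sys/vm/drop_caches (i.e. root); the assert below fails the run otherwise.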
+ assert(dropCacheResult == 0) + } + } + +} + +object BenchmarkSuiteSelector extends Logging{ + + private val allRegisterSuites = new ArrayBuffer[OapTestSuite]() + + def registerSuite(suite: OapTestSuite) = { + allRegisterSuites.append(suite) + logWarning(s"Register $suite") + } + + def allSuites: Seq[OapTestSuite] = allRegisterSuites + + var wildcardSuite: Option[String] = None + + def build(name: String): Unit = wildcardSuite = Some(name) + + // TODO: regex support + def selectedSuites(): Seq[OapTestSuite] = wildcardSuite match { + case Some(name) =>allRegisterSuites.filter(_.toString.contains(name)) + case None => allRegisterSuites + } +} + +object BenchmarkTestSelector { + private var wildcardTest: Seq[String] = Seq.empty + + def build(name: String): Unit = wildcardTest = name.split(';') + + def selectedTests(): Seq[String] = wildcardTest +} diff --git a/oap-cache/oap/src/test/oap-perf-suite/scala/org/apache/spark/sql/TestUtil.scala b/oap-cache/oap/src/test/oap-perf-suite/scala/org/apache/spark/sql/TestUtil.scala new file mode 100755 index 000000000..41a1a00f1 --- /dev/null +++ b/oap-cache/oap/src/test/oap-perf-suite/scala/org/apache/spark/sql/TestUtil.scala @@ -0,0 +1,144 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.spark.sql + +import org.apache.hadoop.conf.Configuration +import org.apache.hadoop.fs.{Path, PathFilter} + +object TestUtil { + val conf = new Configuration() + val oapDataFilter = new PathFilter { + override def accept(path: Path) = path.getName.endsWith(".data") + } + val parquetDataFilter = new PathFilter { + override def accept(path: Path) = path.getName.endsWith(".parquet") + } + + + def convertFileSize(size: Long): String = { + val kb: Long = 1024 + val mb: Long = kb * 1024 + val gb: Long = mb * 1024 + val tb: Long = gb * 1024 + + if (size >= tb) "%.1f TB".format(size.toFloat / tb) + else if (size >= gb) { + val f = size.toFloat / gb + (if (f > 100) "%.0f GB" else "%.1f GB").format(f) + } else if (size >= mb) { + val f = size.toFloat / mb + (if (f > 100) "%.0f MB" else "%.1f MB").format(f) + } else if (size >= kb) { + val f = size.toFloat / kb + (if (f > 100) "%.0f KB" else "%.1f KB").format(f) + } else "%d B".format(size) + } + + def calculateIndexSize(tableName: String, tablePath: String, attr: String): String = { + val path = new Path(tablePath + tableName) + val indexFilter = new PathFilter { + override def accept(path: Path) = path.getName.endsWith(s"${attr}_index.index") + } + val size = path.getFileSystem(conf).listStatus(path, indexFilter).map(_.getLen).sum + convertFileSize(size) + } + def calculateFileSize(tableName: String, tablePath: String, format: String) : String = { + val path = new Path(tablePath + tableName) + val size = path.getFileSystem(conf).listStatus(path, + if (format == "oap") oapDataFilter else parquetDataFilter).map(_.getLen).sum + convertFileSize(size) + } + + def calculateFileSize(path: Path, filter: PathFilter): String = { + val size = path.getFileSystem(conf).listStatus(path, filter).map(_.getLen).sum + convertFileSize(size) + } + + def time[T](code: => T, action: String): Unit = { + val t0 = System.nanoTime + code + val t1 = System.nanoTime + println(action + ((t1 - t0) / 1000000) + "ms") + } + + def queryTime[T](code: => T): Int = { + val t0 = System.nanoTime + code + val t1 = System.nanoTime + ((t1 - t0) / 1000000).toInt + } + + def median(s: Seq[Int]): Int = { + val sortSeq = s.sortWith(_ < _) + if (sortSeq.length % 2 == 0) (sortSeq(sortSeq.length / 2 - 1) + sortSeq(sortSeq.length / 2)) / 2 + else sortSeq(sortSeq.length / 2) + } + + def formatResults(resultSet: Seq[(String, Seq[(String, Array[Int])])]): Unit = { + assert(resultSet.nonEmpty) + + resultSet.foreach{ result => + val header = + Seq(("%" + Tabulator.MAX_WIDTH + "s").format(Tabulator.truncate(result._1))) ++ + (1 to result._2(0)._2.length).map("T" + _ +"/ms") ++ + Seq("Median/ms") + val content = result._2.map(x => + Seq(Tabulator.truncate(x._1)) ++ + x._2.map(_.toString) ++ + Seq(median(x._2).toString) + ) + println(Tabulator.format(Seq(header) ++ content)) + } + } +} + +// TODO: use DataSet.show()?? +object Tabulator { + val MAX_WIDTH = 64 + + def truncate(value: String, length: Int = MAX_WIDTH): String = { + assert(length > 3) + if (value != null && value.length > length) value.substring(0, length - 3) + "..." 
+ else value + } + + def format(table: Seq[Seq[Any]]): String = table match { + case Seq() => "" + case _ => + val sizes = for (row <- table) yield (for (cell <- row) yield + if (cell == null) 0 else cell.toString.length) + val colSizes = for (col <- sizes.transpose) yield col.max + val rows = for (row <- table) yield formatRow(row, colSizes) + formatRows(rowSeparator(colSizes), rows) + } + + def formatRows(rowSeparator: String, rows: Seq[String]): String = ( + rowSeparator :: + rows.head :: + rowSeparator :: + rows.tail.toList ::: + rowSeparator :: + List()).mkString("\n") + + def formatRow(row: Seq[Any], colSizes: Seq[Int]): String = { + val cells = for ((item, size) <- row.zip(colSizes)) yield + if (size == 0) "" else ("%" + size + "s").format(item) + cells.mkString("|", "|", "|") + } + + private def rowSeparator(colSizes: Seq[Int]) = colSizes map { "-" * _ } mkString("+", "+", "+") +} diff --git a/oap-cache/oap/src/test/oap-perf-suite/scala/org/apache/spark/sql/suites/BitmapIndexSuite.scala b/oap-cache/oap/src/test/oap-perf-suite/scala/org/apache/spark/sql/suites/BitmapIndexSuite.scala new file mode 100755 index 000000000..6ef79c951 --- /dev/null +++ b/oap-cache/oap/src/test/oap-perf-suite/scala/org/apache/spark/sql/suites/BitmapIndexSuite.scala @@ -0,0 +1,107 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.spark.sql.suites + +import org.apache.spark.sql.{BenchmarkConfig, OapBenchmarkDataBuilder, OapTestSuite, ParquetVsOrcConfigSet} +import org.apache.spark.sql.internal.oap.OapConf + +object BitmapIndexSuite extends OapTestSuite with ParquetVsOrcConfigSet { + override protected def getAppName: String = "BitmapIndexBenchmarkSuite" + + private val table = "store_sales" + + private val attr = "ss_item_sk1" + + private val range1to10 = (1 to 10).mkString(",") + + private val range1to5 = (1 to 5).mkString(",") + + private def databaseName = + OapBenchmarkDataBuilder.getDatabase(activeConf.getBenchmarkConf(BenchmarkConfig.FILE_FORMAT)) + + private def isDataBaseExists: Boolean = { + if (spark.sqlContext.sql(s"show databases").collect().exists(_.getString(0) == databaseName)) { + spark.sqlContext.sql(s"USE $databaseName") + true + } else { + sys.error(s"ERROR: $databaseName does not exist!") + false + } + } + + private def isTableReady: Boolean = { + if (spark.sqlContext.sql(s"show tables").collect().exists(_.getString(1) == table)) { + val conf = activeConf + if (conf.getBenchmarkConf(BenchmarkConfig.INDEX_ENABLE) == "true") { + if (spark.sqlContext.sql(s"show oindex from $table") + .collect().exists(_.getString(3) == attr)) { + true + } else { + sys.error(s"ERROR: index on $attr does not exist!") + false + } + } else { + true + } + } else { + sys.error(s"ERROR: table $table does not exist!") + false + } + } + + private def isDataReady(): Boolean = isDataBaseExists && isTableReady + + private def setRunningParams(): Boolean = { + val conf = activeConf + if (conf.getBenchmarkConf(BenchmarkConfig.INDEX_ENABLE) == "false") { + spark.sqlContext.conf.setConf(OapConf.OAP_ENABLE_OINDEX, false) + } + + spark.sqlContext.sql(s"USE $databaseName") + true + } + + override def prepare(): Boolean = { + if (isDataReady()) { + setRunningParams() + } else { + false + } + } + + /** + * (name, sql sentence, TODO: profile, etc) + */ + override def testSet = Seq( + OapBenchmarkTest("attr in range1to10", + s"SELECT * FROM $table WHERE $attr in ( $range1to10 )"), + OapBenchmarkTest("attr in range1to5", + s"SELECT * FROM $table WHERE $attr in ( $range1to5 )"), + // Two columns query + OapBenchmarkTest("attr in range1to10 & ss_customer_sk >= 120000", + s"SELECT * FROM $table WHERE $attr in ( $range1to10 ) AND ss_customer_sk >= 120000"), + OapBenchmarkTest("attr in range1to5 & ss_list_price < 100.0", + s"SELECT * FROM $table WHERE $attr in ( $range1to5 ) AND ss_list_price < 100.0"), + // Three columns query + OapBenchmarkTest("attr in range1to10 & ss_customer_sk >= 120000 & ss_list_price < 100.0", + s"SELECT * FROM $table WHERE $attr in ( $range1to10 ) AND ss_customer_sk >= 120000 AND ss_list_price < 100.0"), + OapBenchmarkTest("attr in range1to5 & ss_list_price < 100.0 & ss_net_paid > 500.0", + s"SELECT * FROM $table WHERE $attr in ( $range1to5 ) AND ss_list_price < 100.0 AND ss_net_paid > 500.0"), + OapBenchmarkTest("attr=10 & ss_net_paid>100.0 & ss_net_paid<200.0 & ss_list_price<100.0", + s"SELECT * FROM $table WHERE $attr = 10 AND ss_net_paid > 100.0 AND ss_net_paid < 200.0 AND ss_list_price < 100.0") + ) +} diff --git a/oap-cache/oap/src/test/oap-perf-suite/scala/org/apache/spark/sql/suites/BtreeIndexSuite.scala b/oap-cache/oap/src/test/oap-perf-suite/scala/org/apache/spark/sql/suites/BtreeIndexSuite.scala new file mode 100755 index 000000000..564491bbe --- /dev/null +++ b/oap-cache/oap/src/test/oap-perf-suite/scala/org/apache/spark/sql/suites/BtreeIndexSuite.scala @@ -0,0 +1,105 @@ +/* + * 
Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark.sql.suites + +import org.apache.spark.sql._ +import org.apache.spark.sql.internal.oap.OapConf + +object BtreeIndexSuite + extends OapTestSuite with OapPerfSuiteContext with ParquetVsOrcConfigSet { + override protected def getAppName: String = "BtreeIndexBenchmarkSuite" + + val table = "store_sales" + + val attr = "ss_customer_sk" + + def databaseName = + OapBenchmarkDataBuilder.getDatabase(activeConf.getBenchmarkConf(BenchmarkConfig.FILE_FORMAT)) + + private def isDataBaseReady: Boolean = { + val dbCandidates = spark.sqlContext.sql(s"show databases").collect() + if (dbCandidates.exists(_.getString(0) == databaseName)) { + spark.sqlContext.sql(s"USE $databaseName") + true + } else { + logError(s"$dbCandidates does not contain $databaseName!") + false + } + } + + private def isTableReady: Boolean = { + val tables = spark.sqlContext.sql(s"show tables").collect() + if (tables.exists(_.getString(1) == table)) { + val conf = activeConf + if (conf.getBenchmarkConf(BenchmarkConfig.INDEX_ENABLE) == "true"){ + // Check if index exists. 
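+        // "show oindex" lists OAP indices on the table; this check assumes column 3 of the
+        // result holds the indexed attribute name.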
+ spark.sqlContext.sql(s"show oindex from $table").collect().exists(_.getString(3) == attr) + } else { + true + } + } else { + logError(s"$tables does not contain $table!") + false + } + } + + private def isDataReady(): Boolean = isDataBaseReady && isTableReady + + private def setRunningParams(): Boolean = { + val conf = activeConf + if (conf.getBenchmarkConf(BenchmarkConfig.INDEX_ENABLE) == "false"){ + spark.sqlContext.conf.setConf(OapConf.OAP_ENABLE_OINDEX, false) + } + + true + } + + override def prepare(): Boolean = { + if (isDataReady()) { + setRunningParams() + } else { + sys.error("ERROR: Data is not ready!") + false + } + } + + /** + * (name, sql sentence, TODO: profile, etc) + */ + override def testSet = Seq( + OapBenchmarkTest("attr < Int.MaxValue", + s"SELECT * FROM $table WHERE $attr < ${Int.MaxValue}"), + OapBenchmarkTest("attr < 100000", + s"SELECT * FROM $table WHERE $attr < 100000"), + OapBenchmarkTest("attr = 600000", + s"SELECT * FROM $table WHERE $attr = 600000"), + OapBenchmarkTest("attr BETWEEN 10 & 80", + s"SELECT * FROM $table WHERE $attr BETWEEN 10 AND 80"), + // Two columns query + OapBenchmarkTest("attr < 100000 & ss_ticket_number >= 120000", + s"SELECT * FROM $table WHERE $attr < 100000 AND ss_ticket_number >= 120000"), + OapBenchmarkTest("attr < 10000 & ss_list_price < 100.0", + s"SELECT * FROM $table WHERE $attr < 10000 AND ss_list_price < 100.0"), + // Three columns query + OapBenchmarkTest("attr < 100000 & ss_ticket_number >= 120000 & ss_list_price < 100.0", + s"SELECT * FROM $table WHERE $attr < 100000 AND ss_ticket_number >= 120000 AND ss_list_price < 100.0"), + OapBenchmarkTest("attr < 10000 & ss_list_price < 100.0 & ss_net_paid > 500.0", + s"SELECT * FROM $table WHERE $attr < 10000 AND ss_list_price < 100.0 AND ss_net_paid > 500.0"), + OapBenchmarkTest("attr < 1000 & ss_net_paid > 100.0 & ss_net_paid < 110.0 & ss_list_price < 100.0", + s"SELECT * FROM $table WHERE $attr < 1000 AND ss_net_paid > 100.0 AND ss_net_paid < 110.0 AND ss_list_price < 100.0") + ) +} diff --git a/oap-cache/oap/src/test/oap-perf-suite/scala/org/apache/spark/sql/suites/LocalSparkMasterTestSuite.scala b/oap-cache/oap/src/test/oap-perf-suite/scala/org/apache/spark/sql/suites/LocalSparkMasterTestSuite.scala new file mode 100755 index 000000000..914128ff0 --- /dev/null +++ b/oap-cache/oap/src/test/oap-perf-suite/scala/org/apache/spark/sql/suites/LocalSparkMasterTestSuite.scala @@ -0,0 +1,93 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.spark.sql.suites + +import org.apache.spark.sql._ +import org.apache.spark.util.Utils + +object LocalSparkMasterTestSuite extends OapTestSuite with LocalClusterConfigSet { + + protected object testImplicits extends SQLImplicits { + protected override def _sqlContext: SQLContext = spark.sqlContext + } + import testImplicits._ + + override def isBootStrapping: Boolean = true + + override protected def getAppName: String = "LocalSparkMasterTestSuite" + + def isDataExists(): Boolean = { + val db = spark.sqlContext.sql(s"show databases").collect() + if (db.exists(_.getString(0) == databaseName)) { + spark.sqlContext.sql(s"USE $databaseName") + val table = spark.sqlContext.sql(s"show tables").collect() + if (table.exists(_.getString(1) == tableName)) true + else false + } else { + false + } + } + + def prepareData(): Boolean = { + val conf = activeConf + val path = Utils.createTempDir().getAbsolutePath + val format = conf.getBenchmarkConf(BenchmarkConfig.FILE_FORMAT) + val data: Seq[(Int, Int)] = (1 to 3000).map { i => (i, i)} + data.toDF("rowId", attr).write.mode("overwrite").format(format).save(path) + spark.sqlContext.sql(s"CREATE DATABASE IF NOT EXISTS $databaseName") + spark.sqlContext.createExternalTable(s"$databaseName.$tableName", path, format) + + // TODO: Drop table and index after all. + if (conf.getBenchmarkConf(BenchmarkConfig.INDEX_ENABLE).toBoolean) { + spark.sql(s"create oindex ${attr}_index on $databaseName.$tableName ($attr)") + } + + spark.sqlContext.sql(s"USE $databaseName") + true + } + + def databaseName = { + val conf = activeConf + conf.getBenchmarkConf(BenchmarkConfig.FILE_FORMAT) match { + case "parquet" => "parquet_base" + case "oap" => "oap_target" + case _ => "default" + } + } + + val tableName = "test_table" + + val attr = "test_column" + + override def prepare(): Boolean = { + if (isDataExists()) { + true + } else { + prepareData() + } + } + + override def testSet = Seq( + OapBenchmarkTest("eq = 1", + s"SELECT * FROM $tableName WHERE $attr = 1"), + OapBenchmarkTest("eq = 2", + s"SELECT * FROM $tableName WHERE $attr = 2"), + OapBenchmarkTest("check how many char can be displayed in name column-54--------64-----", + s"SELECT * FROM $tableName WHERE $attr = 2") + + ) +} diff --git a/oap-cache/oap/src/test/scala/org/apache/spark/sql/execution/datasources/oap/filecache/MemoryManagerConfigSuite.scala b/oap-cache/oap/src/test/scala/org/apache/spark/sql/execution/datasources/oap/filecache/MemoryManagerConfigSuite.scala new file mode 100644 index 000000000..b57ef9338 --- /dev/null +++ b/oap-cache/oap/src/test/scala/org/apache/spark/sql/execution/datasources/oap/filecache/MemoryManagerConfigSuite.scala @@ -0,0 +1,172 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.execution.datasources.oap.filecache + +import org.apache.spark.SparkEnv +import org.apache.spark.internal.Logging +import org.apache.spark.sql.internal.oap.OapConf +import org.apache.spark.sql.test.oap.SharedOapContext + +class MemoryManagerConfigSuite extends SharedOapContext with Logging{ + + override def afterAll(): Unit = { + // restore oapSparkConf to default + oapSparkConf.set("spark.oap.cache.strategy", "guava") + oapSparkConf.set("spark.sql.oap.fiberCache.memory.manager", "offheap") + oapSparkConf.set("spark.sql.oap.mix.data.cache.backend", "guava") + } + + test("guava cache with offheap memory manager") { + val sparkEnv = SparkEnv.get + sparkEnv.conf.set("spark.oap.cache.strategy", "guava") + sparkEnv.conf.set("spark.sql.oap.fiberCache.memory.manager", "offheap") + val memoryManager = MemoryManager(sparkEnv) + assert(memoryManager.isInstanceOf[OffHeapMemoryManager]) + + } + + test("guava cache with pm memory manager") { + val sparkEnv = SparkEnv.get + sparkEnv.conf.set("spark.oap.cache.strategy", "guava") + sparkEnv.conf.set("spark.sql.oap.fiberCache.memory.manager", "pm") + val memoryManager = MemoryManager(sparkEnv) + assert(memoryManager.isInstanceOf[PersistentMemoryManager]) + } + + test("vmem with tmp memory manager") { + val sparkEnv = SparkEnv.get + sparkEnv.conf.set("spark.oap.cache.strategy", "vmem") + val memoryManager = MemoryManager(sparkEnv) + assert(memoryManager.isInstanceOf[TmpDramMemoryManager]) + } + + test("vmem with memory manager set to pm") { + val sparkEnv = SparkEnv.get + sparkEnv.conf.set("spark.oap.cache.strategy", "vmem") + sparkEnv.conf.set("spark.sql.oap.fiberCache.memory.manager", "pm") + val memoryManager = MemoryManager(sparkEnv) + assert(memoryManager.isInstanceOf[TmpDramMemoryManager]) + } + + test("noevict with hybrid memory manager") { + val sparkEnv = SparkEnv.get + sparkEnv.conf.set("spark.oap.cache.strategy", "noevict") + val memoryManager = MemoryManager(sparkEnv) + assert(memoryManager.isInstanceOf[HybridMemoryManager]) + } + + test("noevict with memory manager set to pm") { + val sparkEnv = SparkEnv.get + sparkEnv.conf.set("spark.oap.cache.strategy", "noevict") + sparkEnv.conf.set("spark.sql.oap.fiberCache.memory.manager", "pm") + val memoryManager = MemoryManager(sparkEnv) + assert(memoryManager.isInstanceOf[HybridMemoryManager]) + } + + test("mix cache with offheap as index memory manager") { + val sparkEnv = SparkEnv.get + sparkEnv.conf.set("spark.oap.cache.strategy", "mix") + sparkEnv.conf.set("spark.sql.oap.fiberCache.memory.manager", "mix") + sparkEnv.conf.set("spark.sql.oap.mix.index.memory.manager", "offheap") + val indexMemoryManager = MemoryManager(sparkEnv, + OapConf.OAP_FIBERCACHE_STRATEGY, FiberType.INDEX) + assert(indexMemoryManager.isInstanceOf[OffHeapMemoryManager]) + } + + test("mix cache with persistent memory as index memory manager") { + val sparkEnv = SparkEnv.get + sparkEnv.conf.set("spark.oap.cache.strategy", "mix") + sparkEnv.conf.set("spark.sql.oap.fiberCache.memory.manager", "mix") + sparkEnv.conf.set("spark.sql.oap.mix.index.memory.manager", "pm") + val indexMemoryManager = MemoryManager(sparkEnv, + OapConf.OAP_FIBERCACHE_STRATEGY, FiberType.INDEX) + assert(indexMemoryManager.isInstanceOf[PersistentMemoryManager]) + } + + test("mix cache with offheap as data memory manager") { + val sparkEnv = SparkEnv.get + sparkEnv.conf.set("spark.oap.cache.strategy", "mix") + sparkEnv.conf.set("spark.sql.oap.fiberCache.memory.manager", "mix") + 
sparkEnv.conf.set("spark.sql.oap.mix.data.memory.manager", "offheap") + val dataMemoryManager = MemoryManager(sparkEnv, + OapConf.OAP_FIBERCACHE_STRATEGY, FiberType.DATA) + assert(dataMemoryManager.isInstanceOf[OffHeapMemoryManager]) + } + + test("mix cache with pm as data memory manager") { + val sparkEnv = SparkEnv.get + sparkEnv.conf.set("spark.oap.cache.strategy", "mix") + sparkEnv.conf.set("spark.sql.oap.fiberCache.memory.manager", "mix") + sparkEnv.conf.set("spark.sql.oap.mix.data.cache.backend", "guava") + sparkEnv.conf.set("spark.sql.oap.mix.data.memory.manager", "pm") + val dataMemoryManager = MemoryManager(sparkEnv, + OapConf.OAP_FIBERCACHE_STRATEGY, FiberType.DATA) + assert(dataMemoryManager.isInstanceOf[PersistentMemoryManager]) + } + + test("mix cache with separate memory manager using vmem as cache backend") { + val sparkEnv = SparkEnv.get + sparkEnv.conf.set("spark.oap.cache.strategy", "mix") + sparkEnv.conf.set("spark.sql.oap.fiberCache.memory.manager", "mix") + sparkEnv.conf.set("spark.sql.oap.mix.data.cache.backend", "vmem") + val dataMemoryManager = MemoryManager(sparkEnv, + OapConf.OAP_FIBERCACHE_STRATEGY, FiberType.DATA) + assert(dataMemoryManager.isInstanceOf[TmpDramMemoryManager]) + } + + test("mix cache with separate memory manager using guava as cache backend") { + val sparkEnv = SparkEnv.get + sparkEnv.conf.set("spark.oap.cache.strategy", "mix") + sparkEnv.conf.set("spark.sql.oap.fiberCache.memory.manager", "mix") + sparkEnv.conf.set("spark.sql.oap.mix.data.memory.manager", "offheap") + sparkEnv.conf.set("spark.sql.oap.mix.data.cache.backend", "guava") + val dataMemoryManager = MemoryManager(sparkEnv, + OapConf.OAP_FIBERCACHE_STRATEGY, FiberType.DATA) + assert(dataMemoryManager.isInstanceOf[OffHeapMemoryManager]) + } + + test("mix cache with separate memory manager using incorrect guava setting") { + val sparkEnv = SparkEnv.get + sparkEnv.conf.set("spark.oap.cache.strategy", "mix") + sparkEnv.conf.set("spark.sql.oap.fiberCache.memory.manager", "mix") + sparkEnv.conf.set("spark.sql.oap.mix.data.memory.manager", "hybrid") + sparkEnv.conf.set("spark.sql.oap.mix.data.cache.backend", "guava") + assertThrows[UnsupportedOperationException]( + MemoryManager(sparkEnv, OapConf.OAP_FIBERCACHE_STRATEGY, FiberType.DATA)) + } + + test("mix cache with separate memory manager using incorrect vmem setting") { + val sparkEnv = SparkEnv.get + sparkEnv.conf.set("spark.oap.cache.strategy", "mix") + sparkEnv.conf.set("spark.sql.oap.fiberCache.memory.manager", "mix") + sparkEnv.conf.set("spark.sql.oap.mix.data.memory.manager", "hybrid") + sparkEnv.conf.set("spark.sql.oap.mix.data.cache.backend", "vmem") + val dataMemoryManager = MemoryManager(sparkEnv, + OapConf.OAP_FIBERCACHE_STRATEGY, FiberType.DATA) + assert(dataMemoryManager.isInstanceOf[TmpDramMemoryManager]) + } + + test("mix cache with unified memory manager pm") { + val sparkEnv = SparkEnv.get + sparkEnv.conf.set("spark.oap.cache.strategy", "mix") + sparkEnv.conf.set("spark.sql.oap.fiberCache.memory.manager", "pm") + sparkEnv.conf.set("spark.sql.oap.mix.data.cache.backend", "vmem") + val dataMemoryManager = MemoryManager(sparkEnv) + assert(dataMemoryManager.isInstanceOf[PersistentMemoryManager]) + } +} diff --git a/oap-common/.gitignore b/oap-common/.gitignore new file mode 100644 index 000000000..5ed116bff --- /dev/null +++ b/oap-common/.gitignore @@ -0,0 +1,5 @@ +src/resources +target + +**/*.idea +**/*.iml diff --git a/oap-common/README.md b/oap-common/README.md new file mode 100644 index 000000000..3d4b64a4a --- /dev/null 
+++ b/oap-common/README.md @@ -0,0 +1,16 @@ +# OAP Common + +OAP commoan package includes native libraries and JNI interface for Intel Optane PMem. + +## Prerequisites +Below libraries need to be installed in the machine + +- [Memkind](http://memkind.github.io/memkind/) + +- [Vmemcache](https://github.com/pmem/vmemcache) + +## Building + +``` +mvn clean package -Ppersistent-memory,vmemcache +``` \ No newline at end of file diff --git a/oap-common/pom.xml b/oap-common/pom.xml new file mode 100644 index 000000000..933fe9a09 --- /dev/null +++ b/oap-common/pom.xml @@ -0,0 +1,127 @@ + + + + 4.0.0 + + + com.intel + oap + 0.8.0 + ../pom.xml + + + oap-common + jar + + + + com.google.guava + guava + ${guava.version} + + + org.slf4j + slf4j-api + ${slf4j.version} + + + org.slf4j + slf4j-log4j12 + ${slf4j.version} + + + + + + + org.apache.maven.plugins + maven-compiler-plugin + 3.3 + + ${java.version} + ${java.version} + UTF-8 + 1024m + true + + -Xlint:all,-serial,-path + + + + + + + + + persistent-memory + + + + ${basedir}/src/resources + + + + + exec-maven-plugin + org.codehaus.mojo + ${exec.maven.version} + + + Compile memkind + generate-resources + + exec + + + ${basedir}/src/native/memkind/compile.sh + + + + + + + + + vmemcache + + + + ${basedir}/src/resources + + + + + exec-maven-plugin + org.codehaus.mojo + ${exec.maven.version} + + + Compile vmemcache + generate-resources + + exec + + + ${basedir}/src/native/vmemcache/compile.sh + + + + + + + + + + diff --git a/oap-cache/oap/src/main/java/org/apache/spark/unsafe/PersistentMemoryPlatform.java b/oap-common/src/main/java/com/intel/oap/common/unsafe/PersistentMemoryPlatform.java similarity index 61% rename from oap-cache/oap/src/main/java/org/apache/spark/unsafe/PersistentMemoryPlatform.java rename to oap-common/src/main/java/com/intel/oap/common/unsafe/PersistentMemoryPlatform.java index d0a75e91c..7bfd38630 100644 --- a/oap-cache/oap/src/main/java/org/apache/spark/unsafe/PersistentMemoryPlatform.java +++ b/oap-common/src/main/java/com/intel/oap/common/unsafe/PersistentMemoryPlatform.java @@ -15,13 +15,17 @@ * limitations under the License. */ -package org.apache.spark.unsafe; - -import java.io.File; +package com.intel.oap.common.unsafe; import com.google.common.base.Preconditions; +import com.intel.oap.common.util.NativeLibraryLoader; + +import java.io.File; +import java.lang.reflect.Constructor; +import java.lang.reflect.Field; +import java.nio.ByteBuffer; -import org.apache.spark.util.NativeLibraryLoader; +import sun.misc.Cleaner; /** * A platform used to allocate/free volatile memory from @@ -29,9 +33,9 @@ * e.g. Intel Optane DC persistent memory. */ public class PersistentMemoryPlatform { + private static volatile boolean initialized = false; private static final String LIBNAME = "pmplatform"; - static { NativeLibraryLoader.load(LIBNAME); } @@ -61,8 +65,31 @@ public static void initialize(String path, long size, int pattern) { } } + /** + * Initialize the persistent memory with dax kmem node. + */ + public static void initialize() { + synchronized (PersistentMemoryPlatform.class) { + if (!initialized) { + initializeKmem(); + initialized = true; + } + } + } + + private static native void initializeKmem(); + private static native void initializeNative(String path, long size, int pattern); + /** + * For DAX KMEM usage only + * @param daxNodeId the numa node created from persistent memory. + * memkind will set it as MEMKIND_DAX_KMEM_NODES env. 
+ * by using MEMKIND_DAX_KMEM_NODES, memkind will recognize this node + * @param regularNodeId the numa node from dram + */ + public static native void setNUMANode(String daxNodeId, String regularNodeId); + /** * Allocate volatile memory from persistent memory. * @param size the requested size @@ -71,6 +98,34 @@ public static void initialize(String path, long size, int pattern) { */ public static native long allocateVolatileMemory(long size); + /** + * Allocate direct buffer from persistent memory. + * @param size the requested size + * @return the byte buffer which same as Platform.allocateDirectBuffer, it can be operated by + * Platform which same as OFF_HEAP memory. + */ + public static ByteBuffer allocateVolatileDirectBuffer(int size) { + try { + Class cls = Class.forName("java.nio.DirectByteBuffer"); + Constructor constructor = cls.getDeclaredConstructor(Long.TYPE, Integer.TYPE); + constructor.setAccessible(true); + Field cleanerField = cls.getDeclaredField("cleaner"); + cleanerField.setAccessible(true); + final long memory = allocateVolatileMemory(size); + ByteBuffer buffer = (ByteBuffer) constructor.newInstance(memory, size); + Cleaner cleaner = Cleaner.create(buffer, new Runnable() { + @Override + public void run() { + freeMemory(memory); + } + }); + cleanerField.set(buffer, cleaner); + return buffer; + } catch (Exception e) { + throw new RuntimeException(e); + } + } + /** * Get the actual occupied size of the given address. The occupied size should be different * with the requested size because of the memory management of Intel Optane DC persistent diff --git a/oap-cache/oap/src/main/java/org/apache/spark/unsafe/VMEMCacheJNI.java b/oap-common/src/main/java/com/intel/oap/common/unsafe/VMEMCacheJNI.java similarity index 94% rename from oap-cache/oap/src/main/java/org/apache/spark/unsafe/VMEMCacheJNI.java rename to oap-common/src/main/java/com/intel/oap/common/unsafe/VMEMCacheJNI.java index c5e82e4c4..e9f195acc 100644 --- a/oap-cache/oap/src/main/java/org/apache/spark/unsafe/VMEMCacheJNI.java +++ b/oap-common/src/main/java/com/intel/oap/common/unsafe/VMEMCacheJNI.java @@ -1,4 +1,6 @@ -package org.apache.spark.unsafe; +package com.intel.oap.common.unsafe; + +import com.intel.oap.common.util.NativeLibraryLoader; import java.nio.ByteBuffer; @@ -17,7 +19,7 @@ public class VMEMCacheJNI { static { LOG.info("Trying to load the native library from jni..."); - NativeLoader.loadLibrary(LIBRARY_NAME); + NativeLibraryLoader.load(LIBRARY_NAME); } public static synchronized int initialize(String path, long maxSize) { diff --git a/oap-cache/oap/src/main/java/org/apache/spark/util/NativeLibraryLoader.java b/oap-common/src/main/java/com/intel/oap/common/util/NativeLibraryLoader.java similarity index 99% rename from oap-cache/oap/src/main/java/org/apache/spark/util/NativeLibraryLoader.java rename to oap-common/src/main/java/com/intel/oap/common/util/NativeLibraryLoader.java index 9c392ceec..ff5598202 100644 --- a/oap-cache/oap/src/main/java/org/apache/spark/util/NativeLibraryLoader.java +++ b/oap-common/src/main/java/com/intel/oap/common/util/NativeLibraryLoader.java @@ -15,7 +15,7 @@ * limitations under the License. 
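A rough Scala sketch of how a caller might drive the relocated PersistentMemoryPlatform in app-direct (fsdax) mode is shown below. The mount point, pool size, and pattern argument are placeholders rather than values taken from this patch, and allocation is assumed to require a prior initialize() call.

```scala
import com.intel.oap.common.unsafe.PersistentMemoryPlatform

// App-direct (fsdax) mode: initialize a memkind pool on a PMem mount point.
// "/mnt/pmem0", the 16 GB size, and the pattern value 0 are placeholders.
PersistentMemoryPlatform.initialize("/mnt/pmem0", 16L * 1024 * 1024 * 1024, 0)

// Raw allocation/free, matching the native methods declared above.
val addr = PersistentMemoryPlatform.allocateVolatileMemory(4096L)
println(s"requested 4096, occupied ${PersistentMemoryPlatform.getOccupiedSize(addr)}")
PersistentMemoryPlatform.freeMemory(addr)

// Direct-buffer allocation: the attached Cleaner frees the PMem block when
// the buffer is garbage collected.
val buf = PersistentMemoryPlatform.allocateVolatileDirectBuffer(1024)
buf.putInt(0, 42)
```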
*/ -package org.apache.spark.util; +package com.intel.oap.common.util; import java.io.*; import java.util.HashMap; diff --git a/oap-cache/oap/src/main/native/CMakeLists.txt b/oap-common/src/native/memkind/CMakeLists.txt similarity index 93% rename from oap-cache/oap/src/main/native/CMakeLists.txt rename to oap-common/src/native/memkind/CMakeLists.txt index d915855c5..3671f52e1 100644 --- a/oap-cache/oap/src/main/native/CMakeLists.txt +++ b/oap-common/src/native/memkind/CMakeLists.txt @@ -27,7 +27,7 @@ INCLUDE_DIRECTORIES(${JNI_INCLUDE_DIRS}) SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11") -SET(SOURCE_FILES org_apache_spark_unsafe_PersistentMemoryPlatform.cpp) +SET(SOURCE_FILES com_intel_oap_common_unsafe_PersistentMemoryPlatform.cpp) ADD_LIBRARY(pmplatform SHARED ${SOURCE_FILES}) diff --git a/oap-cache/oap/src/main/native/org_apache_spark_unsafe_PersistentMemoryPlatform.cpp b/oap-common/src/native/memkind/com_intel_oap_common_unsafe_PersistentMemoryPlatform.cpp similarity index 70% rename from oap-cache/oap/src/main/native/org_apache_spark_unsafe_PersistentMemoryPlatform.cpp rename to oap-common/src/native/memkind/com_intel_oap_common_unsafe_PersistentMemoryPlatform.cpp index 8ae9371bb..f7ba8cf56 100644 --- a/oap-cache/oap/src/main/native/org_apache_spark_unsafe_PersistentMemoryPlatform.cpp +++ b/oap-common/src/native/memkind/com_intel_oap_common_unsafe_PersistentMemoryPlatform.cpp @@ -16,13 +16,13 @@ */ #include -#include +#include #include #include #include #include #include -#include "org_apache_spark_unsafe_PersistentMemoryPlatform.h" +#include "com_intel_oap_common_unsafe_PersistentMemoryPlatform.h" using memkind = struct memkind; memkind *pmemkind = NULL; @@ -50,7 +50,12 @@ inline void check(JNIEnv *env) { } } -JNIEXPORT void JNICALL Java_org_apache_spark_unsafe_PersistentMemoryPlatform_initializeNative +JNIEXPORT void JNICALL Java_com_intel_oap_common_unsafe_PersistentMemoryPlatform_initializeKmem + (JNIEnv *, jclass) { + pmemkind = MEMKIND_DAX_KMEM; +} + +JNIEXPORT void JNICALL Java_com_intel_oap_common_unsafe_PersistentMemoryPlatform_initializeNative (JNIEnv *env, jclass clazz, jstring path, jlong size, jint pattern) { // str should not be null, we should checked in java code const char* str = env->GetStringUTFChars(path, NULL); @@ -78,7 +83,18 @@ JNIEXPORT void JNICALL Java_org_apache_spark_unsafe_PersistentMemoryPlatform_ini env->ReleaseStringUTFChars(path, str); } -JNIEXPORT jlong JNICALL Java_org_apache_spark_unsafe_PersistentMemoryPlatform_allocateVolatileMemory +JNIEXPORT void JNICALL Java_com_intel_oap_common_unsafe_PersistentMemoryPlatform_setNUMANode + (JNIEnv *env, jclass, jstring dax_node, jstring regular_node) { + const char* dax_node_str = env->GetStringUTFChars(dax_node, NULL); + const char* regular_node_str = env->GetStringUTFChars(regular_node, NULL); + + setenv("MEMKIND_REGULAR_NODES", regular_node_str, 1); + setenv("MEMKIND_DAX_KMEM_NODES", dax_node_str, 1); + env->ReleaseStringUTFChars(regular_node, regular_node_str); + env->ReleaseStringUTFChars(dax_node, dax_node_str); +} + +JNIEXPORT jlong JNICALL Java_com_intel_oap_common_unsafe_PersistentMemoryPlatform_allocateVolatileMemory (JNIEnv *env, jclass clazz, jlong size) { check(env); @@ -96,15 +112,23 @@ JNIEXPORT jlong JNICALL Java_org_apache_spark_unsafe_PersistentMemoryPlatform_al return addr_to_java(p); } -JNIEXPORT jlong JNICALL Java_org_apache_spark_unsafe_PersistentMemoryPlatform_getOccupiedSize +JNIEXPORT jlong JNICALL Java_com_intel_oap_common_unsafe_PersistentMemoryPlatform_getOccupiedSize (JNIEnv 
*env, jclass clazz, jlong address) { check(env); void *p = addr_from_java(address); return memkind_malloc_usable_size(pmemkind, p); } -JNIEXPORT void JNICALL Java_org_apache_spark_unsafe_PersistentMemoryPlatform_freeMemory +JNIEXPORT void JNICALL Java_com_intel_oap_common_unsafe_PersistentMemoryPlatform_freeMemory (JNIEnv *env, jclass clazz, jlong address) { check(env); memkind_free(pmemkind, addr_from_java(address)); } + +JNIEXPORT void JNICALL Java_com_intel_oap_common_unsafe_PersistentMemoryPlatform_copyMemory + (JNIEnv *env, jclass clazz, jlong destination, jlong source, jlong size) { + size_t sz = (size_t)size; + void *dest = addr_from_java(destination); + void *src = addr_from_java(source); + std::memcpy(dest, src, sz); +} diff --git a/oap-common/src/native/memkind/com_intel_oap_common_unsafe_PersistentMemoryPlatform.h b/oap-common/src/native/memkind/com_intel_oap_common_unsafe_PersistentMemoryPlatform.h new file mode 100644 index 000000000..be410fe5c --- /dev/null +++ b/oap-common/src/native/memkind/com_intel_oap_common_unsafe_PersistentMemoryPlatform.h @@ -0,0 +1,87 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
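For the DAX KMEM path added above, a hedged sketch of the expected call order follows: setNUMANode exports MEMKIND_DAX_KMEM_NODES / MEMKIND_REGULAR_NODES before the no-argument initialize() switches the pool to MEMKIND_DAX_KMEM. The node ids are placeholders, and the Java-side declaration of copyMemory is assumed from the generated header rather than shown in this diff.

```scala
import com.intel.oap.common.unsafe.PersistentMemoryPlatform

// DAX KMEM mode: export the NUMA node mapping first, then switch the pool to
// MEMKIND_DAX_KMEM via the no-argument initialize(). Node ids "2" (PMem) and
// "0" (DRAM) are placeholders for a real topology.
PersistentMemoryPlatform.setNUMANode("2", "0") // MEMKIND_DAX_KMEM_NODES / MEMKIND_REGULAR_NODES
PersistentMemoryPlatform.initialize()

val src = PersistentMemoryPlatform.allocateVolatileMemory(1024L)
val dst = PersistentMemoryPlatform.allocateVolatileMemory(1024L)
// copyMemory dispatches to std::memcpy in the JNI implementation above; its
// Java-side native declaration is assumed from the generated header.
PersistentMemoryPlatform.copyMemory(dst, src, 1024L)
```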
+ */ + +/* DO NOT EDIT THIS FILE - it is machine generated */ +#include +/* Header for class com_intel_oap_common_unsafe_PersistentMemoryPlatform */ + +#ifndef _Included_com_intel_oap_common_unsafe_PersistentMemoryPlatform +#define _Included_com_intel_oap_common_unsafe_PersistentMemoryPlatform +#ifdef __cplusplus +extern "C" { +#endif + +/* + * Class: com_intel_oap_common_unsafe_PersistentMemoryPlatform + * Method: initializeNative + * Signature: (J)V + */ +JNIEXPORT void JNICALL Java_com_intel_oap_common_unsafe_PersistentMemoryPlatform_initializeKmem + (JNIEnv *, jclass); + +/* + * Class: com_intel_oap_common_unsafe_PersistentMemoryPlatform + * Method: initializeNative + * Signature: (Ljava/lang/String;J)V + */ +JNIEXPORT void JNICALL Java_com_intel_oap_common_unsafe_PersistentMemoryPlatform_initializeNative + (JNIEnv *, jclass, jstring, jlong, jint); + +/* + * Class: com_intel_oap_common_unsafe_PersistentMemoryPlatform + * Method: setNUMANode + * Signature: (Ljava/lang/String;)V + */ +JNIEXPORT void JNICALL Java_com_intel_oap_common_unsafe_PersistentMemoryPlatform_setNUMANode + (JNIEnv *, jclass, jstring, jstring); + +/* + * Class: com_intel_oap_common_unsafe_PersistentMemoryPlatform + * Method: allocateMemory + * Signature: (J)J + */ +JNIEXPORT jlong JNICALL Java_com_intel_oap_common_unsafe_PersistentMemoryPlatform_allocateVolatileMemory + (JNIEnv *, jclass, jlong); + +/* + * Class: com_intel_oap_common_unsafe_PersistentMemoryPlatform + * Method: getOccupiedSize + * Signature: (J)J + */ +JNIEXPORT jlong JNICALL Java_com_intel_oap_common_unsafe_PersistentMemoryPlatform_getOccupiedSize + (JNIEnv *, jclass, jlong); + +/* + * Class: com_intel_oap_common_unsafe_PersistentMemoryPlatform + * Method: freeMemory + * Signature: (J)V + */ +JNIEXPORT void JNICALL Java_com_intel_oap_common_unsafe_PersistentMemoryPlatform_freeMemory + (JNIEnv *, jclass, jlong); + +/* + * Class: com_intel_oap_common_unsafe_PersistentMemoryPlatform + * Method: copyMemory + * Signature: (JJJ)V + */ +JNIEXPORT void JNICALL Java_com_intel_oap_common_unsafe_PersistentMemoryPlatform_copyMemory + (JNIEnv *, jclass, jlong, jlong, jlong); + +#ifdef __cplusplus +} +#endif +#endif diff --git a/oap-cache/oap/src/main/native/compile.sh b/oap-common/src/native/memkind/compile.sh similarity index 89% rename from oap-cache/oap/src/main/native/compile.sh rename to oap-common/src/native/memkind/compile.sh index 8f9b9ec82..74c4c4643 100755 --- a/oap-cache/oap/src/main/native/compile.sh +++ b/oap-common/src/native/memkind/compile.sh @@ -48,8 +48,9 @@ case ${ARCH} in ;; esac -CURRENT_DIR="$(dirname "$0")" -RESOURCES_DIR=${CURRENT_DIR}/../resources/${OS}/${ARCH} +CURRENT_DIR=$(cd "$(dirname "$BASH_SOURCE")"; pwd) +RESOURCES_DIR=${CURRENT_DIR}/../../resources/${OS}/${ARCH} +echo $RESOURCES_DIR if [ ! 
-d ${RESOURCES_DIR} ]; then mkdir -p ${RESOURCES_DIR} @@ -65,4 +66,7 @@ cmake -DCMAKE_INSTALL_PREFIX=${RESOURCES_DIR} ../ make make install +cd $CURRENT_DIR +rm -r build + set +eu diff --git a/oap-cache/oap/src/main/native/vmemcache/Makefile b/oap-common/src/native/vmemcache/Makefile similarity index 97% rename from oap-cache/oap/src/main/native/vmemcache/Makefile rename to oap-common/src/native/vmemcache/Makefile index b240c2017..7fdb250f7 100644 --- a/oap-cache/oap/src/main/native/vmemcache/Makefile +++ b/oap-common/src/native/vmemcache/Makefile @@ -65,7 +65,7 @@ $(TARGET) : $(SRCS) $(foreach D,$(LIB_DIRS),-L$D) \ $(foreach L,$(LIBS),-l$L) \ -o $@ - $(ECHO)cp $(TARGET) ../../resources/lib/linux64 + $(ECHO)cp $(TARGET) ../../resources/linux/64/lib clean: $(ECHO)rm -rf *.so *.o diff --git a/oap-common/src/native/vmemcache/compile.sh b/oap-common/src/native/vmemcache/compile.sh new file mode 100755 index 000000000..a0803219a --- /dev/null +++ b/oap-common/src/native/vmemcache/compile.sh @@ -0,0 +1,63 @@ +#!/usr/bin/env bash + +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +set -eu +# detect OS +OS="`uname -s`" +case ${OS} in + 'Linux' ) + OS='linux' + ;; + 'Darwin') + OS='mac' + ;; + *) + echo "The platform: ${OS} is not supported." + exit -1 + ;; +esac + +# detect Arch +ARCH="`uname -m`" +case ${ARCH} in + "x86_64") + ARCH="64" + ;; + "i686") + ARCH="32" + ;; + *) + echo "The arch: ${ARCH} is not supported." + exit -2 + ;; +esac + +CURRENT_DIR=$(cd "$(dirname "$BASH_SOURCE")"; pwd) +RESOURCES_DIR=${CURRENT_DIR}/../../resources/${OS}/${ARCH} +echo $RESOURCES_DIR + +if [ ! 
-d ${RESOURCES_DIR}/lib ]; then + mkdir -p ${RESOURCES_DIR}/lib +fi + +cd ${CURRENT_DIR} + +make && make clean + +set +eu diff --git a/oap-cache/oap/src/main/native/vmemcache/vmemcachejni.c b/oap-common/src/native/vmemcache/vmemcachejni.c similarity index 92% rename from oap-cache/oap/src/main/native/vmemcache/vmemcachejni.c rename to oap-common/src/native/vmemcache/vmemcachejni.c index 2d67df350..c7723fc04 100644 --- a/oap-cache/oap/src/main/native/vmemcache/vmemcachejni.c +++ b/oap-common/src/native/vmemcache/vmemcachejni.c @@ -54,13 +54,13 @@ static void check(JNIEnv *env) } } -/*org.apache.spark.unsafe - * Class: com_intel_dcpmcache_vmemcache_VMEMCacheJNI +/*com.intel.oap.common.unsafe + * Class: com_intel_oap_common_unsafe_VMEMCacheJNI * Method: init * Signature: (Ljava/lang/String;J)I */ JNIEXPORT jint JNICALL -Java_org_apache_spark_unsafe_VMEMCacheJNI_init( +Java_com_intel_oap_common_unsafe_VMEMCacheJNI_init( JNIEnv *env, jclass cls, jstring path, jlong maxSize) { const char* pathString = (*env)->GetStringUTFChars(env, path, NULL); @@ -86,12 +86,12 @@ Java_org_apache_spark_unsafe_VMEMCacheJNI_init( } /* - * Class: com_intel_dcpmcache_vmemcache_VMEMCacheJNI + * Class: com_intel_oap_common_unsafe_VMEMCacheJNI * Method: putNative * Signature: ([BLjava/nio/ByteBuffer;II[BLjava/nio/ByteBuffer;II)I */ JNIEXPORT jint JNICALL -Java_org_apache_spark_unsafe_VMEMCacheJNI_putNative( +Java_com_intel_oap_common_unsafe_VMEMCacheJNI_putNative( JNIEnv *env, jclass cls, jbyteArray keyArray, jobject keyBuffer, jint keyOff, jint keyLen, jlong valueBaseAddr, jint valueOff, jint valueLen) { @@ -132,12 +132,12 @@ Java_org_apache_spark_unsafe_VMEMCacheJNI_putNative( } /* - * Class: com_intel_dcpmcache_vmemcache_VMEMCacheJNI + * Class: com_intel_oap_common_unsafe_VMEMCacheJNI * Method: put * Signature: ([BLjava/nio/ByteBuffer;II[BLjava/nio/ByteBuffer;II)I */ JNIEXPORT jint JNICALL -Java_org_apache_spark_unsafe_VMEMCacheJNI_put( +Java_com_intel_oap_common_unsafe_VMEMCacheJNI_put( JNIEnv *env, jclass cls, jbyteArray keyArray, jobject keyBuffer, jint keyOff, jint keyLen, jbyteArray valueArray, jobject valueBuffer, jint valueOff, jint valueLen) { @@ -188,12 +188,12 @@ Java_org_apache_spark_unsafe_VMEMCacheJNI_put( } /* - * Class: com_intel_dcpmcache_vmemcache_VMEMCacheJNI + * Class: com_intel_oap_common_unsafe_VMEMCacheJNI * Method: get * Signature: ([BLjava/nio/ByteBuffer;II[BLjava/nio/ByteBuffer;II)I */ JNIEXPORT jint JNICALL -Java_org_apache_spark_unsafe_VMEMCacheJNI_get( +Java_com_intel_oap_common_unsafe_VMEMCacheJNI_get( JNIEnv *env, jclass cls, jbyteArray keyArray, jobject keyBuffer, jint keyOff, jint keyLen, jbyteArray valueArray, jobject valueBuffer, jint valueOff, jint maxValueLen) { @@ -244,12 +244,12 @@ Java_org_apache_spark_unsafe_VMEMCacheJNI_get( } /* - * Class: com_intel_dcpmcache_vmemcache_VMEMCacheJNI + * Class: com_intel_oap_common_unsafe_VMEMCacheJNI * Method: getNative * Signature: ([BLjava/nio/ByteBuffer;II[BLjava/nio/ByteBuffer;II)I */ JNIEXPORT jint JNICALL -Java_org_apache_spark_unsafe_VMEMCacheJNI_getNative( +Java_com_intel_oap_common_unsafe_VMEMCacheJNI_getNative( JNIEnv *env, jclass cls, jbyteArray keyArray, jobject keyBuffer, jint keyOff, jint keyLen, jlong valueBaseObj, jint valueOff, jint maxValueLen) { @@ -290,12 +290,12 @@ Java_org_apache_spark_unsafe_VMEMCacheJNI_getNative( } /* - * Class: com_intel_dcpmcache_vmemcache_VMEMCacheJNI + * Class: com_intel_oap_common_unsafe_VMEMCacheJNI * Method: evict * Signature: ([BLjava/nio/ByteBuffer;II)I */ JNIEXPORT jint JNICALL 
-Java_org_apache_spark_unsafe_VMEMCacheJNI_evict( +Java_com_intel_oap_common_unsafe_VMEMCacheJNI_evict( JNIEnv *env, jclass cls, jbyteArray keyArray, jobject keyBuffer, jint keyOff, jint keyLen) { const char* key; @@ -328,12 +328,12 @@ Java_org_apache_spark_unsafe_VMEMCacheJNI_evict( } /* - * Class: com_intel_dcpmcache_vmemcache_VMEMCacheJNI + * Class: com_intel_oap_common_unsafe_VMEMCacheJNI * Method: exist * Signature: ([BLjava/nio/ByteBuffer;II[BLjava/nio/ByteBuffer;II)I */ JNIEXPORT jint JNICALL -Java_org_apache_spark_unsafe_VMEMCacheJNI_exist( +Java_com_intel_oap_common_unsafe_VMEMCacheJNI_exist( JNIEnv *env, jclass cls, jbyteArray keyArray, jobject keyBuffer, jint keyOff, jint keyLen) { const char* key; @@ -362,12 +362,12 @@ Java_org_apache_spark_unsafe_VMEMCacheJNI_exist( } /* - * Class: com_intel_dcpmcache_vmemcache_VMEMCacheJNI + * Class: com_intel_oap_common_unsafe_VMEMCacheJNI * Method: status * Signature: ([BLjava/nio/ByteBuffer;II[BLjava/nio/ByteBuffer;II)I */ JNIEXPORT jint JNICALL -Java_org_apache_spark_unsafe_VMEMCacheJNI_status( +Java_com_intel_oap_common_unsafe_VMEMCacheJNI_status( JNIEnv *env, jclass cls, jlongArray statusArray) { stat_t stat; diff --git a/pom.xml b/pom.xml new file mode 100644 index 000000000..95046b214 --- /dev/null +++ b/pom.xml @@ -0,0 +1,45 @@ + + + + 4.0.0 + + com.intel + oap + 0.8.0 + pom + + OAP + https://github.com/Intel-bigdata/OAP + + + + The Apache Software License, Version 2.0 + http://www.apache.org/licenses/LICENSE-2.0.txt + + + + + 1.6.0 + 18.0 + 1.8 + 1.7.16 + + + + oap-common + oap-cache/oap + + +
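To round out the rename, a hedged sketch of the Java-facing VMEMCacheJNI calls is below. Only initialize(String, long) is visible on the Java side in this diff; the remaining signatures and the null handling for the unused ByteBuffer arguments are inferred from the JNI declarations above, so treat them as assumptions, and the pool path and size are placeholders.

```scala
import java.nio.charset.StandardCharsets
import com.intel.oap.common.unsafe.VMEMCacheJNI

// Initialize a vmemcache pool on a PMem mount point; path and the 1 GB size are placeholders.
VMEMCacheJNI.initialize("/mnt/pmem0/vmemcache", 1024L * 1024 * 1024)

val key   = "fiber-0".getBytes(StandardCharsets.UTF_8)
val value = new Array[Byte](4096)

// byte[]-based put/get/exist/evict; the ByteBuffer parameters are assumed to be
// nullable when the byte[] form is supplied (inferred from the JNI signatures, not verified).
VMEMCacheJNI.put(key, null, 0, key.length, value, null, 0, value.length)

val out  = new Array[Byte](4096)
val read = VMEMCacheJNI.get(key, null, 0, key.length, out, null, 0, out.length)
val hit  = VMEMCacheJNI.exist(key, null, 0, key.length)
VMEMCacheJNI.evict(key, null, 0, key.length)
```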