From 92c1a3d7e79c5bb33992db29177aed7da5f4c24a Mon Sep 17 00:00:00 2001 From: "Mark A. Olson" Date: Fri, 7 Jan 2022 19:36:15 -0800 Subject: [PATCH 1/5] DAOS-9507 Modifications for DAOS v2.0 Changed image build scripts to install and configure DAOS v2.0. Image family name now includes OS. (Ex. daos-server-centos7) Changed IO500 install script to install IO500 SC21. Updated terraform/examples/io500 to use new os image family names and run IO500 SC21. Refactored env variable names to be more obvious about what they are used for. Signed-off-by: Mark A. Olson --- images/configs/daos_server.yml | 21 ++- images/daos-client-image.json | 14 +- images/daos-server-image.json | 11 +- images/make_images.sh | 97 ++++++---- ...ild.yaml => packer_cloudbuild-server.yaml} | 0 images/scripts/install-client.sh | 88 +++++---- images/scripts/install-server.sh | 69 +++++++ images/scripts/install.sh | 47 ----- images/scripts/io500-sc21_install.sh | 148 +++++++++++++++ images/scripts/io500_install.sh | 114 ------------ images/scripts/mfu_install.sh | 92 ---------- images/scripts/mpifileutils_install.sh | 170 ++++++++++++++++++ images/scripts/{setup.sh => setup-server.sh} | 2 +- images/scripts/tune.sh | 2 +- terraform/examples/io500/clean.sh | 12 +- terraform/examples/io500/configure.sh | 71 +++++--- terraform/examples/io500/run_io500-sc21.sh | 138 ++++++++++++++ terraform/examples/io500/setup_io500.sh | 93 ---------- terraform/examples/io500/start.sh | 62 ++++--- terraform/examples/io500/stop.sh | 8 +- 20 files changed, 766 insertions(+), 493 deletions(-) rename images/{packer_cloudbuild.yaml => packer_cloudbuild-server.yaml} (100%) create mode 100644 images/scripts/install-server.sh delete mode 100644 images/scripts/install.sh create mode 100644 images/scripts/io500-sc21_install.sh delete mode 100644 images/scripts/io500_install.sh delete mode 100644 images/scripts/mfu_install.sh create mode 100644 images/scripts/mpifileutils_install.sh rename images/scripts/{setup.sh => setup-server.sh} (96%) create mode 100755 terraform/examples/io500/run_io500-sc21.sh delete mode 100755 terraform/examples/io500/setup_io500.sh diff --git a/images/configs/daos_server.yml b/images/configs/daos_server.yml index 705deca..b9a1955 100644 --- a/images/configs/daos_server.yml +++ b/images/configs/daos_server.yml @@ -3,24 +3,29 @@ transport_config: allow_insecure: true provider: ofi+tcp;ofi_rxm disable_vfio: true -crt_timeout: 200 +crt_timeout: 300 +nr_hugepages: 4096 +control_log_file: /var/daos/daos_server.log engines: - targets: 8 nr_xs_helpers: 0 first_core: 1 - fabric_iface_port: 31316 + bypass_health_chk: true fabric_iface: eth0 + fabric_iface_port: 31316 log_mask: ERR log_file: /var/daos/engine.log env_vars: - FI_OFI_RXM_DEF_TCP_WAIT_OBJ=pollfd - scm_mount: /var/daos/ram - scm_class: ram - scm_size: 100 - - bdev_class: nvme - bdev_list: ["0000:00:04.0"] + storage: + - + scm_mount: /var/daos/ram + class: ram + scm_size: 100 + - + class: nvme + bdev_list: ["0000:00:04.0"] diff --git a/images/daos-client-image.json b/images/daos-client-image.json index b6fa41d..bfa2958 100644 --- a/images/daos-client-image.json +++ b/images/daos-client-image.json @@ -1,19 +1,19 @@ { "variables": { - "DAOS_VERSION": "1.2.0", - "IO500_INSTALL_DIR": "/usr/local" + "DAOS_VERSION": "2.0.0" }, "builders": [ { "type": "googlecompute", + "name": "daos-client-centos7", "account_file": "{{user `account_file`}}", "project_id": "{{user `project_id`}}", "source_image_project_id": [ "cloud-hpc-image-public" ], "source_image_family": "hpc-centos-7", - "image_name": "daos-client-v{{isotime \"20060102-030405\"}}", - "image_family": "daos-client", + "image_name": "daos-client-centos7-v{{isotime \"20060102-030405\"}}", + "image_family": "daos-client-centos7", "machine_type": "n1-standard-16", "disk_size": "20", "ssh_username": "packer", @@ -41,14 +41,12 @@ { "type": "shell", "execute_command": "echo 'packer' | sudo -S env {{ .Vars }} {{ .Path }}", - "environment_vars": "IO500_INSTALL_DIR={{user `IO500_INSTALL_DIR`}}", - "script": "./scripts/mfu_install.sh" + "script": "./scripts/mpifileutils_install.sh" }, { "type": "shell", "execute_command": "echo 'packer' | sudo -S env {{ .Vars }} {{ .Path }}", - "environment_vars": "IO500_INSTALL_DIR={{user `IO500_INSTALL_DIR`}}", - "script": "./scripts/io500_install.sh" + "script": "./scripts/io500-sc21_install.sh" } ] } diff --git a/images/daos-server-image.json b/images/daos-server-image.json index 5442343..101ba3e 100644 --- a/images/daos-server-image.json +++ b/images/daos-server-image.json @@ -1,18 +1,19 @@ { "variables": { - "DAOS_VERSION": "1.2.0" + "DAOS_VERSION": "2.0.0" }, "builders": [ { "type": "googlecompute", + "name": "daos-server-centos7", "account_file": "{{user `account_file`}}", "project_id": "{{user `project_id`}}", "source_image_project_id": [ "centos-cloud" ], "source_image_family": "centos-7", - "image_name": "daos-server-v{{isotime \"20060102-030405\"}}", - "image_family": "daos-server", + "image_name": "daos-server-centos7-v{{isotime \"20060102-030405\"}}", + "image_family": "daos-server-centos7", "machine_type": "n1-standard-16", "disk_size": "20", "ssh_username": "packer", @@ -35,7 +36,7 @@ "type": "shell", "execute_command": "echo 'packer' | sudo -S env {{ .Vars }} {{ .Path }}", "environment_vars": "DAOS_VERSION={{user `DAOS_VERSION`}}", - "script": "./scripts/install.sh" + "script": "./scripts/install-server.sh" }, { "type": "file", @@ -51,7 +52,7 @@ "type": "shell", "execute_command": "echo 'packer' | sudo -S env {{ .Vars }} {{ .Path }}", "environment_vars": "DAOS_VERSION={{user `DAOS_VERSION`}}", - "script": "./scripts/setup.sh" + "script": "./scripts/setup-server.sh" } ] } diff --git a/images/make_images.sh b/images/make_images.sh index f11ec3d..0ff9c8c 100755 --- a/images/make_images.sh +++ b/images/make_images.sh @@ -5,56 +5,91 @@ # without warranty or representation for any use or purpose. # Your use of it is subject to your agreements with Google. +# +# To build both DAOS client and server images: +# ./make_images.sh +# +# To build DAOS client images only: +# ./make_images.sh client +# +# To build DAOS server images only: +# ./make_images.sh server +# + +set -e +trap 'echo "Unexpected and unchecked error. Exiting."' ERR + +# Set environment variable defaults if not already set +: "${IMAGE_TYPE:=all}" + +if [[ ! -z $1 ]]; then + IMAGE_TYPE=$(echo $1 | tr '[A-Z]' '[a-z]') + if [[ ! $IMAGE_TYPE =~ ^(all|server|client)$ ]]; then + echo "Invalid value passed for first arg." + echo "Valid values are 'all', 'client', 'server'" + exit 1 + fi +fi + PROJECT=$(gcloud info --format="value(config.project)") -fwrulename=gcp-cloudbuild-ssh +FWRULENAME=gcp-cloudbuild-ssh # The service account used here should have been already created #by the "packer_build" step. We are just checking here. -CLOUD_BUILD_ACCOUNT=$(gcloud projects get-iam-policy $PROJECT \ +CLOUD_BUILD_ACCOUNT=$(gcloud projects get-iam-policy "${PROJECT}" \ --filter="(bindings.role:roles/cloudbuild.builds.builder)" \ ---flatten="bindings[].members" --format="value(bindings.members[])") +--flatten="bindings[].members" \ +--format="value(bindings.members[])" \ +--limit=1) echo "Packer will be using service account ${CLOUD_BUILD_ACCOUNT}" + # Add cloudbuild SA permissions -gcloud projects add-iam-policy-binding $PROJECT \ - --member $CLOUD_BUILD_ACCOUNT \ +gcloud projects add-iam-policy-binding "${PROJECT}" \ + --member "${CLOUD_BUILD_ACCOUNT}" \ --role roles/compute.instanceAdmin.v1 -gcloud projects add-iam-policy-binding $PROJECT \ - --member $CLOUD_BUILD_ACCOUNT \ +gcloud projects add-iam-policy-binding "${PROJECT}" \ + --member "${CLOUD_BUILD_ACCOUNT}" \ --role roles/iam.serviceAccountUser -# check if we have an ssh firewall rule for cloudbuild in place already -fwlist=$(gcloud compute --project=${PROJECT} firewall-rules list --filter name=${fwrulename} \ +# Check if we have an ssh firewall rule for cloudbuild in place already +FWLIST=$(gcloud compute --project="${PROJECT}" \ + firewall-rules list \ + --filter name="${FWRULENAME}" \ --sort-by priority \ --format='value(name)') -if [ -z $fwlist ] ; - then - #setup firewall rule to allow ssh from clould build. - #FIXME: Needs to be fixed to restric to IP range - #for clound build only once we know what that is. - echo "setting up firewall rule for ssh and clouldbuild." - gcloud compute --project=${PROJECT} firewall-rules create ${fwrulename} \ - --direction=INGRESS --priority=1000 --network=default --action=ALLOW \ - --rules=tcp:22 --source-ranges=0.0.0.0/0 - else - echo "Firewall rule for ssh and cloud build already in place. " +if [[ -z $FWLIST ]]; then + # Setup firewall rule to allow ssh from clould build. + # FIXME: Needs to be fixed to restric to IP range + # for clound build only once we know what that is. + echo "Setting up firewall rule for ssh and clouldbuild" + gcloud compute --project="${PROJECT}" firewall-rules create "${FWRULENAME}" \ + --direction=INGRESS --priority=1000 --network=default --action=ALLOW \ + --rules=tcp:22 --source-ranges=0.0.0.0/0 +else + echo "Firewall rule for ssh and cloud build already in place. " fi -#build image. We need to make sure we don't time out so we increase to 1hr. -gcloud builds submit --timeout=1800s \ - --substitutions=_PROJECT_ID=${PROJECT} \ - --config=packer_cloudbuild.yaml . - - -gcloud builds submit --timeout=1800s \ - --substitutions=_PROJECT_ID=${PROJECT} \ - --config=packer_cloudbuild-client.yaml . +# Build images. +# Increase timeout to 1hr to make sure we don't time out +if [[ $IMAGE_TYPE =~ ^(all|server)$ ]]; then + printf "\nBuilding server image(s)\n\n" + gcloud builds submit --timeout=1800s \ + --substitutions=_PROJECT_ID="${PROJECT}" \ + --config=packer_cloudbuild-server.yaml . +fi -# remove ssh firewall -gcloud -q compute --project=${PROJECT} firewall-rules delete ${fwrulename} +if [[ $IMAGE_TYPE =~ ^(all|client)$ ]]; then + printf "\nBuilding client image(s)\n\n" + gcloud builds submit --timeout=1800s \ + --substitutions=_PROJECT_ID="${PROJECT}" \ + --config=packer_cloudbuild-client.yaml . +fi +# Remove ssh firewall +gcloud -q compute --project="${PROJECT}" firewall-rules delete "${FWRULENAME}" diff --git a/images/packer_cloudbuild.yaml b/images/packer_cloudbuild-server.yaml similarity index 100% rename from images/packer_cloudbuild.yaml rename to images/packer_cloudbuild-server.yaml diff --git a/images/scripts/install-client.sh b/images/scripts/install-client.sh index 82ea9d8..3f06fdf 100644 --- a/images/scripts/install-client.sh +++ b/images/scripts/install-client.sh @@ -1,42 +1,35 @@ #!/bin/bash -# Copyright 2021 Google LLC # -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at +# Install Intel OneAPI and the DAOS Client # -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. +set -e +trap 'echo "An unexpected error occurred. Exiting."' ERR -echo "Installing DAOS version ${DAOS_VERSION}" +# DAOS_VERSION must be set before running this script +if [[ -z $DAOS_VERSION ]]; then + echo "DAOS_VERSION not set. Exiting." + exit 1 +fi -# Install 1.2.0 RPMs from official site -tee /etc/yum.repos.d/daos.repo > /dev/null < /etc/yum.repos.d/oneAPI.repo << EOF +log "Installing Intel oneAPI MPI" + +# Install Intel MPI from Intel oneAPI package +cat > /etc/yum.repos.d/oneAPI.repo < /etc/yum.repos.d/daos.repo < /etc/yum.repos.d/daos.repo < /dev/null < io500_prepare.patch <<'EOF' +diff --git a/prepare.sh b/prepare.sh +index f793dfe..d4cb7e8 100755 +--- a/prepare.sh ++++ b/prepare.sh +@@ -8,7 +8,7 @@ echo It will output OK at the end if builds succeed + echo + + IOR_HASH=14deedfec48ce295dff683d15c1b194652bd6d08 +-PFIND_HASH=62c3a7e31 ++PFIND_HASH=mfu_integration + + INSTALL_DIR=$PWD + BIN=$INSTALL_DIR/bin +@@ -59,7 +59,7 @@ function get_ior { + + function get_pfind { + echo "Preparing parallel find" +- git_co https://github.com/VI4IO/pfind.git pfind $PFIND_HASH ++ git_co https://github.com/mchaarawi/pfind pfind $PFIND_HASH + } + + function get_schema_tools { +@@ -73,7 +73,7 @@ function build_ior { + pushd $BUILD/ior + ./bootstrap + # Add here extra flags +- ./configure --prefix=$INSTALL_DIR ++ ./configure --prefix=$INSTALL_DIR --with-daos=${MY_DAOS_INSTALL_PATH} + cd src + $MAKE clean + $MAKE install +EOF + +git apply io500_prepare.patch + + +# Update the Makefile with correct paths +# The Makefile needs to be updated to use the install location of DAOS and MFU. +log "Update ${MY_IO500_PATH}/Makefile with correct paths" + +cat > io500_Makefile.patch < io500_prepare.patch -diff --git a/prepare.sh b/prepare.sh -index de354ee..a2964d7 100755 ---- a/prepare.sh -+++ b/prepare.sh -@@ -7,8 +7,8 @@ echo It will also attempt to build the benchmarks - echo It will output OK at the end if builds succeed - echo - --IOR_HASH=bd76b45ef9db --PFIND_HASH=9d77056adce6 -+IOR_HASH= -+PFIND_HASH=mfu_integration - - INSTALL_DIR=\$PWD - BIN=\$INSTALL_DIR/bin -@@ -59,14 +59,14 @@ function get_ior { - - function get_pfind { - echo "Preparing parallel find" -- git_co https://github.com/VI4IO/pfind.git pfind \$PFIND_HASH -+ git_co https://github.com/mchaarawi/pfind pfind \$PFIND_HASH - } - - ###### BUILD FUNCTIONS - function build_ior { - pushd \$BUILD/ior - ./bootstrap -- ./configure --prefix=\$INSTALL_DIR -+ ./configure --prefix=\$INSTALL_DIR --with-daos=${MY_DAOS_INSTALL_PATH} - cd src - \$MAKE clean - \$MAKE install -EOF - -git apply io500_prepare.patch - -cat << EOF > io500_Makefile.patch -diff --git a/Makefile b/Makefile -index 2975471..5dce307 100644 ---- a/Makefile -+++ b/Makefile -@@ -1,10 +1,13 @@ - CC = mpicc - CFLAGS += -std=gnu99 -Wall -Wempty-body -Werror -Wstrict-prototypes -Werror=maybe-uninitialized -Warray-bounds -+CFLAGS += -I${MY_DAOS_INSTALL_PATH}/include -I${MY_MFU_INSTALL_PATH}/include - - IORCFLAGS = \$(shell grep CFLAGS ./build/ior/Makefile | cut -d "=" -f 2-) - CFLAGS += -g3 -lefence -I./include/ -I./src/ -I./build/pfind/src/ -I./build/ior/src/ - IORLIBS = \$(shell grep LIBS ./build/ior/Makefile | cut -d "=" -f 2-) - LDFLAGS += -lm \$(IORCFLAGS) \$(IORLIBS) # -lgpfs # may need some additional flags as provided to IOR -+LDFLAGS += -L${MY_DAOS_INSTALL_PATH}/lib64 -ldaos -ldaos_common -ldfs -lgurt -luuid -+LDFLAGS += -L${MY_MFU_INSTALL_PATH}/lib64 -lmfu_dfind -lmfu - - VERSION_GIT=\$(shell git describe --always --abbrev=12) - VERSION_TREE=\$(shell git diff src | wc -l | sed -e 's/ *//g' -e 's/^0//' | sed "s/\([0-9]\)/-\1/") -EOF - -git apply io500_Makefile.patch - -cat << 'EOF' > io500_stonewall.patch -diff --git a/src/phase_find.c b/src/phase_find.c -index e282b25..f2bb69c 100644 ---- a/src/phase_find.c -+++ b/src/phase_find.c -@@ -61,6 +61,7 @@ static double run(void){ - int rank; - MPI_Comm_rank(of.pfind_com, & rank); - -+ of.pfind_o->stonewall = 300; - // pfind supports stonewalling timer -s, but ignore for now - pfind_find_results_t * res = pfind_find(of.pfind_o); - if(! res){ -EOF - -git apply io500_stonewall.patch -export I_MPI_OFI_LIBRARY_INTERNAL=0 -export I_MPI_OFI_PROVIDER="tcp;ofi_rxm" -source /opt/intel/oneapi/setvars.sh -# This is expected to error. After, compile.sh is edited with the correct paths -# and the build can be run again. -prepare_build || true - -sed -i "/^DAOS=/c\DAOS=${MY_DAOS_INSTALL_PATH}" ${MY_IO500_PATH}/build/pfind/compile.sh -sed -i "/^MFU=/c\MFU=${MY_MFU_INSTALL_PATH}" ${MY_IO500_PATH}/build/pfind/compile.sh -prepare_build || true - -cd build/ior -git checkout a90d414a304b53c64d331d09104cc8df8bda0226 -make install -cd ../../ -make clean -make - -wget https://raw.githubusercontent.com/mchaarawi/io500/main/config-full.ini -sed -i 's/ --dfs.svcl=$DAOS_SVCL//g' config-full.ini - -echo "Complete!" diff --git a/images/scripts/mfu_install.sh b/images/scripts/mfu_install.sh deleted file mode 100644 index b6eb837..0000000 --- a/images/scripts/mfu_install.sh +++ /dev/null @@ -1,92 +0,0 @@ -#!/bin/bash -mkdir -p $IO500_INSTALL_DIR -cd $IO500_INSTALL_DIR -root=$IO500_INSTALL_DIR - -mkdir -p tools -cd tools -wget https://github.com/Kitware/CMake/releases/download/v3.19.8/cmake-3.19.8-Linux-x86_64.sh -chmod +x cmake-3.19.8-Linux-x86_64.sh -./cmake-3.19.8-Linux-x86_64.sh --skip-license -cd $root - -PATH=$root/tools/bin:$PATH -echo $PATH - -#load Intel MPI -export I_MPI_OFI_LIBRARY_INTERNAL=0 -export I_MPI_OFI_PROVIDER="tcp;ofi_rxm" -source /opt/intel/oneapi/setvars.sh - -mfu=$root/mpifileutils -installdir=$mfu/install -deps=$mfu/deps - -mkdir -p $mfu -mkdir -p $installdir -mkdir -p $deps - -cd $deps - wget https://github.com/hpc/libcircle/releases/download/v0.3/libcircle-0.3.0.tar.gz - wget https://github.com/llnl/lwgrp/releases/download/v1.0.2/lwgrp-1.0.2.tar.gz - wget https://github.com/llnl/dtcmp/releases/download/v1.1.0/dtcmp-1.1.0.tar.gz - - tar -zxf libcircle-0.3.0.tar.gz - cd libcircle-0.3.0 - ./configure --prefix=$installdir - - # Navigate to libcircle source directory - - # Generate patch file -cat << 'EOF' > libcircle_opt.patch ---- a/libcircle/token.c -+++ b/libcircle/token.c -@@ -1307,6 +1307,12 @@ - - LOG(CIRCLE_LOG_DBG, "Sending work request to %d...", source); - -+ /* first always ask rank 0 for work */ -+ int temp; -+ MPI_Comm_rank(comm, &temp); -+ if (st->local_work_requested < 10 && temp != 0 && temp < 512) -+ source = 0; -+ - /* increment number of work requests for profiling */ - st->local_work_requested++; - -EOF - # Apply the patch - patch -p1 < libcircle_opt.patch - make install - cd .. - - tar -zxf lwgrp-1.0.2.tar.gz - cd lwgrp-1.0.2 - ./configure --prefix=$installdir - make install - cd .. - - tar -zxf dtcmp-1.1.0.tar.gz - cd dtcmp-1.1.0 - ./configure --prefix=$installdir --with-lwgrp=$installdir - make install - cd $root - -sudo yum -y install libarchive-devel bzip2-devel openssl-devel jq - -MY_DAOS_INSTALL_PATH=${HOME}/daos/install -MY_MFU_INSTALL_PATH=$installdir -MY_MFU_SOURCE_PATH=$installdir/source -MY_MFU_BUILD_PATH=$mfu/build - -git clone https://github.com/mchaarawi/mpifileutils -b pfind_integration "${MY_MFU_SOURCE_PATH}" -mkdir -p "${MY_MFU_BUILD_PATH}" -cd "${MY_MFU_BUILD_PATH}" -export CFLAGS="-I${MY_DAOS_INSTALL_PATH}/include -I${MY_DAOS_INSTALL_PATH}/include/gurt/" -export LDFLAGS="-L${MY_DAOS_INSTALL_PATH}/lib64/ -luuid -ldaos -ldfs -ldaos_common -lgurt -lpthread" -cmake "${MY_MFU_SOURCE_PATH}" \ - -DENABLE_XATTRS=OFF \ - -DWITH_DTCMP_PREFIX=${MY_MFU_INSTALL_PATH} \ - -DWITH_LibCircle_PREFIX=${MY_MFU_INSTALL_PATH} \ - -DCMAKE_INSTALL_PREFIX=${MY_MFU_INSTALL_PATH} && -make -j8 install diff --git a/images/scripts/mpifileutils_install.sh b/images/scripts/mpifileutils_install.sh new file mode 100644 index 0000000..83b2499 --- /dev/null +++ b/images/scripts/mpifileutils_install.sh @@ -0,0 +1,170 @@ +#!/bin/bash +# +# Install mpifileutils +# +# This script assumes that the intel-oneapi-mpi and intel-oneapi-mpi-devel +# packages from https://yum.repos.intel.com/oneapi have already been installed +# on the system. +# + +set -e +trap 'echo "An unexpected error occurred. Exiting."' ERR + +# Set environment variable defaults if not already set +: "${IO500_INSTALL_ROOT_DIR:=/usr/local}" +: "${TOOLS_DIR=${IO500_INSTALL_ROOT_DIR}/tools}" +: "${DAOS_INSTALL_PATH:=/usr}" + +# MPI File Utils directories +MFU_ROOT_DIR="${IO500_INSTALL_ROOT_DIR}/mpifileutils" +MFU_DEPS_DIR="${MFU_ROOT_DIR}/deps" +MFU_SRC_DIR="${MFU_ROOT_DIR}/src" +MFU_BUILD_DIR="${MFU_ROOT_DIR}/build" +MFU_INSTALL_DIR="${MFU_ROOT_DIR}/install" + +CMAKE_VERSION="3.22.1" + +log() { + local msg="$1" + printf "%80s" | tr " " "-" + printf "\n%s\n" "${msg}" + printf "%80s\n" | tr " " "-" +} + +printf " +=============================================================================== +Installing mpifileutils +=============================================================================== +" + +# Exit if Intel OneAPI is not installed +if [ ! -d /opt/intel/oneapi ];then + printf "\nERROR: Intel OneAPI not found in /opt/intel/oneapi. Exiting." + exit 1 +fi + +# Install packages needed to build mpifileutils and run IO500 +log "Installing Development Tools" +yum group install -y "Development Tools" + +log "Installing additional packages" +yum -y install bzip2-devel libarchive-devel openssl-devel git clustershell jq + +mkdir -p "${IO500_INSTALL_ROOT_DIR}" +mkdir -p "${TOOLS_DIR}" +cd "${TOOLS_DIR}" + +# Install cmake +if [ ! -f "${TOOLS_DIR}/bin/cmake" ];then +log "Installing cmake v${CMAKE_VERSION}" +log "Downloading https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}-Linux-x86_64.sh" +wget "https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}-Linux-x86_64.sh" +chmod +x cmake-${CMAKE_VERSION}-Linux-x86_64.sh +./cmake-${CMAKE_VERSION}-Linux-x86_64.sh --skip-license +rm -f cmake-${CMAKE_VERSION}-Linux-x86_64.sh +fi + +cd "${IO500_INSTALL_ROOT_DIR}" + +# Update PATH +PATH="${TOOLS_DIR}/bin:${PATH}" + +# Load Intel MPI +export I_MPI_OFI_LIBRARY_INTERNAL=0 +export I_MPI_OFI_PROVIDER="tcp;ofi_rxm" +source /opt/intel/oneapi/setvars.sh + +# Create mpifileutils directories +mkdir -p "${MFU_DEPS_DIR}" +mkdir -p "${MFU_BUILD_DIR}" +mkdir -p "${MFU_INSTALL_DIR}" + +# +# Build mpifileutils dependencies +# libcircle +# lwgrp +# dtcmp +# +cd "${MFU_DEPS_DIR}" + +log "Building mpifileutils dependency: libcircle v0.3" +wget https://github.com/hpc/libcircle/releases/download/v0.3/libcircle-0.3.0.tar.gz +tar -zxf libcircle-0.3.0.tar.gz +cd libcircle-0.3.0 + +# Generate patch file +cat << 'EOF' > libcircle_opt.patch +--- a/libcircle/token.c ++++ b/libcircle/token.c +@@ -1307,6 +1307,12 @@ + + LOG(CIRCLE_LOG_DBG, "Sending work request to %d...", source); + ++ /* first always ask rank 0 for work */ ++ int temp; ++ MPI_Comm_rank(comm, &temp); ++ if (st->local_work_requested < 10 && temp != 0 && temp < 512) ++ source = 0; ++ + /* increment number of work requests for profiling */ + st->local_work_requested++; + +EOF + +# Apply the patch +patch -p1 < libcircle_opt.patch + +./configure --prefix="${MFU_INSTALL_DIR}" +make install +cd .. +rm -f libcircle-0.3.0.tar.gz + + +log "Building mpifileutils dependency: lwgrp v1.0.2" +wget https://github.com/llnl/lwgrp/releases/download/v1.0.2/lwgrp-1.0.2.tar.gz +tar -zxf lwgrp-1.0.2.tar.gz +cd lwgrp-1.0.2 +./configure --prefix="${MFU_INSTALL_DIR}" +make install +cd .. +rm -f lwgrp-1.0.2.tar.gz + + +log "Building mpifileutils dependency: dtcmp v1.1.0" +wget https://github.com/llnl/dtcmp/releases/download/v1.1.0/dtcmp-1.1.0.tar.gz +tar -zxf dtcmp-1.1.0.tar.gz +cd dtcmp-1.1.0 +./configure --prefix="${MFU_INSTALL_DIR}" --with-lwgrp="${MFU_INSTALL_DIR}" +make install +cd .. +rm -f dtcmp-1.1.0.tar.gz + + +# +# Build MFU from mchaarawi fork +# +log "Building mpifileutils from https://github.com/mchaarawi/mpifileutils" +cd "${MFU_ROOT_DIR}" +rm -rf "${MFU_SRC_DIR}" +mkdir -p "${MFU_SRC_DIR}" +cd "${MFU_SRC_DIR}" + +# These MY* variables must be set or the build of mpifileutils will fail +export MY_DAOS_INSTALL_PATH="${DAOS_INSTALL_PATH}" +export MY_MFU_INSTALL_PATH="${MFU_INSTALL_DIR}" +export MY_MFU_SOURCE_PATH="${MFU_SRC_DIR}" +export MY_MFU_BUILD_PATH="${MFU_BUILD_DIR}" + +git clone https://github.com/mchaarawi/mpifileutils -b pfind_integration "${MY_MFU_SOURCE_PATH}" && +mkdir -p "${MY_MFU_BUILD_PATH}" && +cd "${MY_MFU_BUILD_PATH}" && +CFLAGS="-I${MY_DAOS_INSTALL_PATH}/include" \ +LDFLAGS="-L${MY_DAOS_INSTALL_PATH}/lib64/ -luuid -ldaos -ldfs -ldaos_common -lgurt -lpthread" \ +cmake "${MY_MFU_SOURCE_PATH}" \ + -DENABLE_XATTRS=OFF \ + -DWITH_DTCMP_PREFIX=${MY_MFU_INSTALL_PATH} \ + -DWITH_LibCircle_PREFIX=${MY_MFU_INSTALL_PATH} \ + -DCMAKE_INSTALL_PREFIX=${MY_MFU_INSTALL_PATH} && +make -j8 install + +printf "\nmpifileutils installation complete!\n\n" diff --git a/images/scripts/setup.sh b/images/scripts/setup-server.sh similarity index 96% rename from images/scripts/setup.sh rename to images/scripts/setup-server.sh index 3b51f0e..d145296 100644 --- a/images/scripts/setup.sh +++ b/images/scripts/setup-server.sh @@ -18,7 +18,7 @@ readonly yaml_path="/etc/daos" readonly meta_path="/usr/share/daos/gcp_metadata.sh" readonly systemd_file="/usr/lib/systemd/system/daos_server.service" -echo "Setting up DAOS version ${DAOS_VERSION}" +echo "Setting up DAOS server version ${DAOS_VERSION}" # Template config files have been copied by packer to /tmp/daos_configs cp -f /tmp/configs/* ${yaml_path} diff --git a/images/scripts/tune.sh b/images/scripts/tune.sh index ea168da..1affdeb 100644 --- a/images/scripts/tune.sh +++ b/images/scripts/tune.sh @@ -26,7 +26,7 @@ update_sysctl() local value=$* echo "Updating sysctl key=$key, value=$value" touch $SYSCTL_CONF - local regex="'s/^\s*$key\s*=.*$/$key = $value/g'" + local regex="s/^\s*${key}\s*=.*$/${key} = ${value}/g" sed -i "$regex" "$SYSCTL_CONF" if ! grep -Fq "$key" "$SYSCTL_CONF"; then echo "sysctl $key not found, appending to $SYSCTL_CONF" diff --git a/terraform/examples/io500/clean.sh b/terraform/examples/io500/clean.sh index d60ae4b..dab626b 100755 --- a/terraform/examples/io500/clean.sh +++ b/terraform/examples/io500/clean.sh @@ -8,19 +8,17 @@ source ./configure.sh for server in ${SERVERS} do - echo "#######################" - echo "# Cleaning ${server}" - echo "#######################" + printf "\nStart cleaning ${server}\n\n" ssh ${server} "rm -f .ssh/known_hosts" ssh ${server} "sudo systemctl stop daos_server" ssh ${server} "sudo rm -rf /var/daos/ram/*" ssh ${server} "sudo umount /var/daos/ram/ && echo success || echo unmounted" - ssh ${server} "sudo sed -i \"s/^crt_timeout:.*/crt_timeout: ${CRT_TIMEOUT}/g\" /etc/daos/daos_server.yml" - ssh ${server} "sudo sed -i \"s/^ targets:.*/ targets: ${DAOS_DISK_COUNT}/g\" /etc/daos/daos_server.yml" - ssh ${server} "sudo sed -i \"s/^ scm_size:.*/ scm_size: ${SCM_SIZE}/g\" /etc/daos/daos_server.yml" + ssh ${server} "sudo sed -i \"s/^crt_timeout:.*/crt_timeout: ${DAOS_SERVER_CRT_TIMEOUT}/g\" /etc/daos/daos_server.yml" + ssh ${server} "sudo sed -i \"s/^ targets:.*/ targets: ${DAOS_SERVER_DISK_COUNT}/g\" /etc/daos/daos_server.yml" + ssh ${server} "sudo sed -i \"s/^ scm_size:.*/ scm_size: ${DAOS_SERVER_SCM_SIZE}/g\" /etc/daos/daos_server.yml" ssh ${server} "cat /etc/daos/daos_server.yml" ssh ${server} "sudo systemctl start daos_server" sleep 4 ssh ${server} "sudo systemctl status daos_server" - echo "Done" + printf "\nFinished cleaning ${server}\n\n" done diff --git a/terraform/examples/io500/configure.sh b/terraform/examples/io500/configure.sh index 613a80b..8cf00d4 100755 --- a/terraform/examples/io500/configure.sh +++ b/terraform/examples/io500/configure.sh @@ -1,23 +1,41 @@ #!/bin/bash +# ------------------------------------------------------------------------------ # Configure below variables to your needs -#-------------------------------------------------------- -ID="" # Identifier for deploying multiple environments in GCP +# ------------------------------------------------------------------------------ +ID="maolson" # Identifier to allow multiple DAOS clusters in the same GCP + # Typically, you want to set this to your username. + # Don't change this value to use the env var '${USER}! It should be + # set to a constant value. + +# Server and client instances PREEMPTIBLE_INSTANCES="true" -NUMBER_OF_SERVERS_INSTANCES="1" -DAOS_DISK_COUNT=8 -NUMBER_OF_CLIENTS_INSTANCES="1" -SERVER_MACHINE_TYPE=n2-highmem-32 # n2-custom-20-131072 n2-custom-40-262144 n2-highmem-32 n2-standard-2 -CLIENT_MACHINE_TYPE=c2-standard-16 # c2-standard-16 n2-standard-2 -CRT_TIMEOUT=300 -SCM_SIZE=100 -STONEWALL_TIME=3 -POOL_SIZE="$(( 375 * ${DAOS_DISK_COUNT} * ${NUMBER_OF_SERVERS_INSTANCES} / 1000 ))TB" -CONTAINER_REPLICATION_FACTOR="rf:0" SSH_USER="daos-user" -# Terraform environmental variables -export TF_VAR_project_id="" +# Server(s) +DAOS_SERVER_INSTANCE_COUNT="4" +DAOS_SERVER_MACHINE_TYPE=n2-highmem-32 # n2-custom-20-131072 n2-custom-40-262144 n2-highmem-32 n2-standard-2 +DAOS_SERVER_DISK_COUNT=16 +DAOS_SERVER_CRT_TIMEOUT=300 +DAOS_SERVER_SCM_SIZE=100 + +# Client(s) +DAOS_CLIENT_INSTANCE_COUNT="2" +DAOS_CLIENT_MACHINE_TYPE=c2-standard-16 # c2-standard-16 n2-standard-2 + +# Storage +DAOS_POOL_SIZE="$(( 375 * ${DAOS_SERVER_DISK_COUNT} * ${DAOS_SERVER_INSTANCE_COUNT} / 1000 ))TB" +DAOS_CONT_REPLICATION_FACTOR="rf:0" + +# IO500 +IO500_STONEWALL_TIME=5 # Amount of seconds to run the benchmark + + +# ------------------------------------------------------------------------------ +# Terraform environment variables +# It's rare that these will need to be changed. +# ------------------------------------------------------------------------------ +export TF_VAR_project_id="$(gcloud info --format="value(config.project)")" export TF_VAR_network="default" export TF_VAR_subnetwork="default" export TF_VAR_subnetwork_project="${TF_VAR_project_id}" @@ -25,30 +43,31 @@ export TF_VAR_region="us-central1" export TF_VAR_zone="us-central1-f" export TF_VAR_preemptible="${PREEMPTIBLE_INSTANCES}" # Servers -export TF_VAR_server_number_of_instances=${NUMBER_OF_SERVERS_INSTANCES} -export TF_VAR_server_daos_disk_count=${DAOS_DISK_COUNT} +export TF_VAR_server_number_of_instances=${DAOS_SERVER_INSTANCE_COUNT} +export TF_VAR_server_daos_disk_count=${DAOS_SERVER_DISK_COUNT} export TF_VAR_server_instance_base_name="daos-server-${ID}" export TF_VAR_server_os_disk_size_gb=20 export TF_VAR_server_os_disk_type="pd-ssd" export TF_VAR_server_template_name="daos-server-${ID}" export TF_VAR_server_mig_name="daos-server-${ID}" -export TF_VAR_server_machine_type="${SERVER_MACHINE_TYPE}" +export TF_VAR_server_machine_type="${DAOS_SERVER_MACHINE_TYPE}" export TF_VAR_server_os_project="${TF_VAR_project_id}" -export TF_VAR_server_os_family="daos-server" +export TF_VAR_server_os_family="daos-server-centos7" # Clients -export TF_VAR_client_number_of_instances=${NUMBER_OF_CLIENTS_INSTANCES} +export TF_VAR_client_number_of_instances=${DAOS_CLIENT_INSTANCE_COUNT} export TF_VAR_client_instance_base_name="daos-client-${ID}" export TF_VAR_client_os_disk_size_gb=20 export TF_VAR_client_os_disk_type="pd-ssd" export TF_VAR_client_template_name="daos-client-${ID}" export TF_VAR_client_mig_name="daos-client-${ID}" -export TF_VAR_client_machine_type="${CLIENT_MACHINE_TYPE}" +export TF_VAR_client_machine_type="${DAOS_CLIENT_MACHINE_TYPE}" export TF_VAR_client_os_project="${TF_VAR_project_id}" -export TF_VAR_client_os_family="daos-client" +export TF_VAR_client_os_family="daos-client-centos7" + -####################### -# Create hosts file # -####################### +# ------------------------------------------------------------------------------ +# Create hosts file +# ------------------------------------------------------------------------------ CLIENT_NAME="daos-client-${ID}" SERVER_NAME="daos-server-${ID}" @@ -58,13 +77,13 @@ unset CLIENTS unset SERVERS unset ALL_NODES -for ((i=1; i <= ${NUMBER_OF_CLIENTS_INSTANCES} ; i++)) +for ((i=1; i <= ${DAOS_CLIENT_INSTANCE_COUNT} ; i++)) do CLIENTS+="${CLIENT_NAME}-$(printf %04d ${i}) " echo ${CLIENT_NAME}-$(printf %04d ${i})>>hosts done -for ((i=1; i <= ${NUMBER_OF_SERVERS_INSTANCES} ; i++)) +for ((i=1; i <= ${DAOS_SERVER_INSTANCE_COUNT} ; i++)) do SERVERS+="${SERVER_NAME}-$(printf %04d ${i}) " done diff --git a/terraform/examples/io500/run_io500-sc21.sh b/terraform/examples/io500/run_io500-sc21.sh new file mode 100755 index 0000000..ee457f3 --- /dev/null +++ b/terraform/examples/io500/run_io500-sc21.sh @@ -0,0 +1,138 @@ +#!/bin/bash +# +# Configure DAOS storage and runs an IO500 benchmark +# + +set -e + +# Load needed variables +source ./configure.sh + +IO500_VERSION_TAG=io500-sc21 + +# Set environment variable defaults if not already set +# This allows for the variables to be set to different values externally. +: "${IO500_INSTALL_DIR:=/usr/local}" +: "${IO500_DIR:=${IO500_INSTALL_DIR}/${IO500_VERSION_TAG}}" +: "${IO500_RESULTS_DIR:=${HOME}/${IO500_VERSION_TAG}/results}" +: "${POOL_LABEL:=io500_pool}" +: "${CONT_LABEL:=io500_cont}" + +log() { + local msg="$1" + printf "\n%80s" | tr " " "-" + printf "\n%s\n" "${msg}" + printf "%80s\n" | tr " " "-" +} + +cleanup(){ + if [[ ! -z $1 ]];then + echo "Hit an unexpected and unchecked error. Cleaning up and exiting." + fi + + log "Clean up" + if [[ -d "${IO500_RESULTS_DIR}" ]];then + echo "Unmount DFuse mountpoint ${IO500_RESULTS_DIR}" + pdsh -w ^hosts sudo fusermount -u "${IO500_RESULTS_DIR}" + echo "fusermount complete!" + fi + source ./clean.sh +} + +#trap cleanup ERR + +log "Prepare for IO500 ${IO500_VERSION_TAG^^} run" + +cleanup + +printf "\nCopy SSH keys to client nodes\n\n" +pdcp -w ^hosts -r .ssh ~ + +printf "\nCopy agent config files from server\n\n" +rm -f .ssh/known_hosts +scp ${DAOS_FIRST_SERVER}:/etc/daos/daos_agent.yml . +scp ${DAOS_FIRST_SERVER}:/etc/daos/daos_control.yml . + +printf "\nConfigure DAOS Clients\n\n" +pdsh -w ^hosts rm -f .ssh/known_hosts +pdsh -w ^hosts sudo systemctl stop daos_agent +pdcp -w ^hosts daos_agent.yml daos_control.yml ~ +pdsh -w ^hosts sudo cp daos_agent.yml daos_control.yml /etc/daos/ +pdsh -w ^hosts sudo systemctl start daos_agent + +log "Format DAOS storage" +echo "Run DAOS storage scan" +dmg -i -l ${SERVERS_LIST_WITH_COMMA} storage scan --verbose +echo "Run storage format" +dmg -i -l ${SERVERS_LIST_WITH_COMMA} storage format --reformat + +printf "Waiting for DAOS storage reformat to finish" +while true +do + if [ $(dmg -i -j system query -v | grep joined | wc -l) -eq ${DAOS_SERVER_INSTANCE_COUNT} ] + then + echo "Done" + dmg -i system query -v + break + fi + printf "." + sleep 10 +done + +log "Create pool: label=${POOL_LABEL} size=${DAOS_POOL_SIZE}" +dmg -i pool create -z ${DAOS_POOL_SIZE} -t 3 -u ${USER} --label=${POOL_LABEL} +echo "Set pool property: reclaim=disabled" +dmg -i pool set-prop ${POOL_LABEL} --name=reclaim --value=disabled +echo "Pool created successfully" +dmg pool query "${POOL_LABEL}" + +log "Create container: label=${CONT_LABEL}" +daos container create --type=POSIX --properties="${DAOS_CONT_REPLICATION_FACTOR}" --label="${CONT_LABEL}" "${POOL_LABEL}" +#export DAOS_CONT_UUID=$(daos -j container create --type=POSIX --properties="${DAOS_CONT_REPLICATION_FACTOR}" --label="${CONT_LABEL}" "${POOL_LABEL}" | jq -r .response.container_uuid) +#echo "DAOS_CONT_UUID:" ${DAOS_CONT_UUID} +# Show container properties +daos cont get-prop ${POOL_LABEL} ${CONT_LABEL} + +export IO500_RESULTS_DIR="${HOME}/io500-${IO500_VERSION_TAG}/results" +pdsh -w ^hosts mkdir -p "${IO500_RESULTS_DIR}" + +log "Use dfuse to mount ${CONT_LABEL} on ${IO500_RESULTS_DIR}" +pdsh -w ^hosts sudo rm -rf "${IO500_RESULTS_DIR}" +pdsh -w ^hosts mkdir -p "${IO500_RESULTS_DIR}" +pdsh -w ^hosts dfuse --pool="${POOL_LABEL}" --container="${CONT_LABEL}" --mountpoint="${IO500_RESULTS_DIR}" +sleep 10 +echo "DFuse complete!" + +log "Load Intel MPI" +export I_MPI_OFI_LIBRARY_INTERNAL=0 +export I_MPI_OFI_PROVIDER="tcp;ofi_rxm" +source /opt/intel/oneapi/setvars.sh + +export PATH=$PATH:${IO500_DIR}/bin +export LD_LIBRARY_PATH=/usr/local/mpifileutils/install/lib64/ + +log "Prepare config file for IO500" + +# Set the following vars in order to do envsubst with config-full-sc21.ini +export DAOS_POOL="${POOL_LABEL}" +export DAOS_CONT="${CONT_LABEL}" +export MFU_POSIX_TS=1 +export IO500_NP=$(( ${DAOS_CLIENT_INSTANCE_COUNT} * $(nproc --all) )) + +cp -f "${IO500_DIR}/config-full-sc21.ini" . +envsubst < config-full-sc21.ini > temp.ini +sed -i "s|^resultdir.*|resultdir = ${IO500_RESULTS_DIR}|g" temp.ini +sed -i "s/^stonewall-time.*/stonewall-time = ${IO500_STONEWALL_TIME}/g" temp.ini +sed -i "s/^transferSize.*/transferSize = 4m/g" temp.ini +#sed -i "s/^blockSize.*/blockSize = 1000000m/g" temp.ini # This causes failures +sed -i "s/^filePerProc.*/filePerProc = TRUE /g" temp.ini +sed -i "s/^nproc.*/nproc = ${IO500_NP}/g" temp.ini + +log "Run IO500" +mpirun -np ${IO500_NP} \ + --hostfile hosts \ + --bind-to socket "${IO500_DIR}/io500" temp.ini + +cleanup + +printf "\nIO500 DONE!\n\n" diff --git a/terraform/examples/io500/setup_io500.sh b/terraform/examples/io500/setup_io500.sh deleted file mode 100755 index b5db49d..0000000 --- a/terraform/examples/io500/setup_io500.sh +++ /dev/null @@ -1,93 +0,0 @@ -#!/bin/bash - -set -e -trap 'echo "Hit an unexpected and unchecked error. Exiting."' ERR - -# Load needed variables -source ./configure.sh - -# Clean and configure DAOS servers -source ./clean.sh - -# Copy SSH keys to client nodes -pdcp -w ^hosts -r .ssh ~ - -echo "Copy agent config files from server" -rm -f .ssh/known_hosts -scp ${DAOS_FIRST_SERVER}:/etc/daos/daos_agent.yml . -scp ${DAOS_FIRST_SERVER}:/etc/daos/daos_control.yml . - -echo "Configure DAOS Clients" -pdsh -w ^hosts rm -f .ssh/known_hosts -pdsh -w ^hosts sudo systemctl stop daos_agent -pdcp -w ^hosts daos_agent.yml daos_control.yml ~ -pdsh -w ^hosts sudo cp daos_agent.yml daos_control.yml /etc/daos/ -pdsh -w ^hosts sudo systemctl start daos_agent - -echo "Format DAOS" -dmg -i -l ${SERVERS_LIST_WITH_COMMA} storage scan --verbose -dmg -i -l ${SERVERS_LIST_WITH_COMMA} storage format --reformat - -echo "Wait for DAOS storage reformat to finish" -printf "Waiting" -while true -do - if [ $(dmg -i -j system query -v | grep joined | wc -l) -eq ${NUMBER_OF_SERVERS_INSTANCES} ] - then - echo "Done" - dmg -i system query -v - break - fi - printf "." - sleep 10 -done - -echo "Create DAOS Pool ${POOL_SIZE}" -export DAOS_POOL=$(dmg -i -j pool create -z ${POOL_SIZE} -t 3 -u ${USER} | jq -r .response.uuid) -echo "DAOS_POOL:" ${DAOS_POOL} -# Show information about a created pool -dmg pool query --pool ${DAOS_POOL} -# Modify a pool's DAOS_PO_RECLAIM reclaim strategies property to never trigger aggregation -dmg -i pool set-prop --pool ${DAOS_POOL} --name=reclaim --value=disabled - -echo "Create DAOS Pool container" -export DAOS_CONT=$(daos container create --type POSIX --pool $DAOS_POOL --properties ${CONTAINER_REPLICATION_FACTOR} | egrep -o '[0-9a-f-]{36}$') -echo "DAOS_CONT:" ${DAOS_CONT} -# Show container properties -daos cont get-prop --pool ${DAOS_POOL} --cont ${DAOS_CONT} - -echo "Mount with DFuse DAOS pool to OS" -export DAOS_FUSE=${HOME}/io500/results -pdsh -w ^hosts mkdir -p ${DAOS_FUSE} -pdsh -w ^hosts dfuse --pool=${DAOS_POOL} --container=${DAOS_CONT} -m ${DAOS_FUSE} -sleep 10 -echo "DFuse complete!" - -echo "Export needed ENVs" -export I_MPI_OFI_LIBRARY_INTERNAL=0 -export I_MPI_OFI_PROVIDER="tcp;ofi_rxm" -export FI_OFI_RXM_USE_SRX=1 -export FI_UNIVERSE_SIZE=16383 -source /opt/intel/oneapi/setvars.sh -export PATH=$PATH:/usr/local/io500/bin -export LD_LIBRARY_PATH=/usr/local/mpifileutils/install/lib64/ - -echo "Prepare config file for IO500" -cp /usr/local/io500/config-full.ini . -envsubst < config-full.ini > temp.ini -sed -i "s/^stonewall-time.*/stonewall-time = ${STONEWALL_TIME}/g" temp.ini -sed -i "s/^transferSize.*/transferSize = 4m/g" temp.ini -sed -i "s/^blockSize.*/blockSize = 1000000m/g" temp.ini -sed -i "s/^filePerProc.*/filePerProc = TRUE /g" temp.ini -sed -i "s/^nproc.*/nproc = $(( ${NUMBER_OF_CLIENTS_INSTANCES} * $(nproc --all) ))/g" temp.ini - -echo "# Run IO500 benchmark" -mpirun --hostfile hosts -env I_MPI_OFI_PROVIDER="tcp;ofi_rxm" --bind-to socket -np $(( ${NUMBER_OF_CLIENTS_INSTANCES} * $(nproc --all) )) /usr/local/io500/io500 temp.ini - -echo "Cleaning up after run ..." -echo "Unmount DFuse mountpoint" -pdsh -w ^hosts sudo fusermount -u ${DAOS_FUSE} -echo "fusermount complete!" -echo "Delete DAOS pool" -res=$(dmg -i pool destroy --pool ${DAOS_POOL}) -echo "dmg says: " $res diff --git a/terraform/examples/io500/start.sh b/terraform/examples/io500/start.sh index 08d184e..d84ceaf 100755 --- a/terraform/examples/io500/start.sh +++ b/terraform/examples/io500/start.sh @@ -6,43 +6,42 @@ trap 'echo "Hit an unexpected and unchecked error. Exiting."' ERR # Load needed variables source ./configure.sh -if [ ! -f images_were_built.flag ] -then - echo "##########################" - echo "# Building DAOS images #" - echo "##########################" +log() { + local msg="$1" + printf "\n%80s" | tr " " "-" + printf "\n%s\n" "${msg}" + printf "%80s\n" | tr " " "-" +} + +if ! gcloud compute images list | grep -q "${TF_VAR_client_os_family}"; then + log "Building DAOS Server & Client images" pushd ../../../images ./make_images.sh popd - touch images_were_built.flag fi -echo "######################################" -echo "# Deploying DAOS Servers & Clients #" -echo "######################################" +log "Deploying DAOS Servers and Clients" pushd ../full_cluster_setup terraform init -input=false terraform plan -out=tfplan -input=false terraform apply -input=false tfplan popd -echo "# Wait for DAOS client instances" +printf "\nWait for DAOS client instances\n\n" gcloud compute instance-groups managed wait-until ${TF_VAR_client_template_name} --stable --zone ${TF_VAR_zone} echo "# Wait for DAOS server instances" gcloud compute instance-groups managed wait-until ${TF_VAR_server_template_name} --stable --zone ${TF_VAR_zone} -echo "# Add external IP to first client, so that it will be accessible over normal SSH" +printf "\nAdd external IP to first client\n\n" gcloud compute instances add-access-config ${DAOS_FIRST_CLIENT} --zone ${TF_VAR_zone} && sleep 10 IP=$(gcloud compute instances describe ${DAOS_FIRST_CLIENT} | grep natIP | awk '{print $2}') -echo "##########################" -echo "# Configure SSH access #" -echo "##########################" -echo "# Prepare SSH key" +log "Configure SSH access" +printf "\nCreate SSH key\n\n" rm -f ./id_rsa* ; ssh-keygen -t rsa -b 4096 -C "${SSH_USER}" -N '' -f id_rsa echo "${SSH_USER}:$(cat id_rsa.pub)" > keys.txt -echo "# Configuring SSH on nodes" +printf "\nConfiguring SSH on nodes\n\n" for node in $ALL_NODES do # Disable OSLogin to be able to connect with SSH keys uploaded in next command @@ -54,25 +53,34 @@ done # Wait for SSH configuring tasks to finish wait -echo "# Copy SSH key to first DAOS client" -scp -i id_rsa -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null \ +printf "\nCopy SSH key to first DAOS client\n\n" +scp -i id_rsa -o IdentitiesOnly=yes -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null \ id_rsa \ id_rsa.pub \ "${SSH_USER}@${IP}:~/.ssh" -ssh -i id_rsa -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null \ +ssh -i id_rsa -o IdentitiesOnly=yes -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null \ ${SSH_USER}@${IP} \ "printf 'Host *\n StrictHostKeyChecking no\n IdentityFile ~/.ssh/id_rsa\n' > ~/.ssh/config && \ chmod -R 600 .ssh/*" -echo "# Copy files" -scp -i id_rsa -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null \ +log "Copy files to first client" +scp -i id_rsa -o IdentitiesOnly=yes -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null \ clean.sh \ configure.sh \ - setup_io500.sh \ + run_io500-sc21.sh \ "${SSH_USER}@${IP}:~" -echo "#########################################################################" -echo "# Now run setup_io500.sh script on ${DAOS_FIRST_CLIENT}" -echo "# SSH to it using this command:" -echo "# ssh -i id_rsa ${SSH_USER}@${IP}" -echo "#########################################################################" +log "DAOS servers and clients deployed successfully" +gcloud compute instances list --filter="name:daos*" + +printf " + +To run an IO500 benchmark: + +1. Log into the first client + ssh -i id_rsa ${SSH_USER}@${IP} + +2. Run the script + ~/run_io500-sc21.sh + +" diff --git a/terraform/examples/io500/stop.sh b/terraform/examples/io500/stop.sh index afe5f0c..f70cc09 100755 --- a/terraform/examples/io500/stop.sh +++ b/terraform/examples/io500/stop.sh @@ -6,9 +6,11 @@ trap 'echo "Hit an unexpected and unchecked error. Exiting."' ERR # Load needed variables source ./configure.sh -echo "####################################" -echo "# Destroying DAOS Servers & Clients" -echo "####################################" +printf " +------------------------------------------------------------------------------- +Destroying DAOS Servers & Clients +------------------------------------------------------------------------------- +" pushd ../full_cluster_setup terraform destroy -auto-approve From f7d5700d3c93b941c8252491a0a70693ad0f393d Mon Sep 17 00:00:00 2001 From: "Mark A. Olson" Date: Mon, 10 Jan 2022 10:03:13 -0800 Subject: [PATCH 2/5] DAOS-9507 Removed user specific info from configure.sh Accidentally checked the file in with user specific info. This change corrects that. Signed-off-by: Mark A. Olson --- terraform/examples/io500/configure.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/terraform/examples/io500/configure.sh b/terraform/examples/io500/configure.sh index 8cf00d4..424d3b9 100755 --- a/terraform/examples/io500/configure.sh +++ b/terraform/examples/io500/configure.sh @@ -3,7 +3,7 @@ # ------------------------------------------------------------------------------ # Configure below variables to your needs # ------------------------------------------------------------------------------ -ID="maolson" # Identifier to allow multiple DAOS clusters in the same GCP +ID="" # Identifier to allow multiple DAOS clusters in the same GCP # Typically, you want to set this to your username. # Don't change this value to use the env var '${USER}! It should be # set to a constant value. @@ -13,14 +13,14 @@ PREEMPTIBLE_INSTANCES="true" SSH_USER="daos-user" # Server(s) -DAOS_SERVER_INSTANCE_COUNT="4" +DAOS_SERVER_INSTANCE_COUNT="1" DAOS_SERVER_MACHINE_TYPE=n2-highmem-32 # n2-custom-20-131072 n2-custom-40-262144 n2-highmem-32 n2-standard-2 DAOS_SERVER_DISK_COUNT=16 DAOS_SERVER_CRT_TIMEOUT=300 DAOS_SERVER_SCM_SIZE=100 # Client(s) -DAOS_CLIENT_INSTANCE_COUNT="2" +DAOS_CLIENT_INSTANCE_COUNT="1" DAOS_CLIENT_MACHINE_TYPE=c2-standard-16 # c2-standard-16 n2-standard-2 # Storage From 583afa59d567d8bd4681944c94b4fde7b98df0b4 Mon Sep 17 00:00:00 2001 From: "Mark A. Olson" Date: Mon, 10 Jan 2022 10:06:16 -0800 Subject: [PATCH 3/5] DAOS-9507 Changed the default number of disks in IO500 example Set the default disk count for the DAOS server configuration back to 8. Signed-off-by: Mark A. Olson --- terraform/examples/io500/configure.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/terraform/examples/io500/configure.sh b/terraform/examples/io500/configure.sh index 424d3b9..ce404b9 100755 --- a/terraform/examples/io500/configure.sh +++ b/terraform/examples/io500/configure.sh @@ -15,7 +15,7 @@ SSH_USER="daos-user" # Server(s) DAOS_SERVER_INSTANCE_COUNT="1" DAOS_SERVER_MACHINE_TYPE=n2-highmem-32 # n2-custom-20-131072 n2-custom-40-262144 n2-highmem-32 n2-standard-2 -DAOS_SERVER_DISK_COUNT=16 +DAOS_SERVER_DISK_COUNT=8 DAOS_SERVER_CRT_TIMEOUT=300 DAOS_SERVER_SCM_SIZE=100 From fe583fca5bd5381651a4d3ecd0b08e20caf0adc2 Mon Sep 17 00:00:00 2001 From: "Mark A. Olson" Date: Tue, 11 Jan 2022 15:30:12 -0800 Subject: [PATCH 4/5] DAOS-9507 Changed image names Image names and image family now include the name of the source image. Signed-off-by: Mark A. Olson --- images/daos-client-image.json | 6 +++--- images/daos-server-image.json | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/images/daos-client-image.json b/images/daos-client-image.json index bfa2958..39fcc71 100644 --- a/images/daos-client-image.json +++ b/images/daos-client-image.json @@ -5,15 +5,15 @@ "builders": [ { "type": "googlecompute", - "name": "daos-client-centos7", + "name": "daos-client-hpc-centos-7", "account_file": "{{user `account_file`}}", "project_id": "{{user `project_id`}}", "source_image_project_id": [ "cloud-hpc-image-public" ], "source_image_family": "hpc-centos-7", - "image_name": "daos-client-centos7-v{{isotime \"20060102-030405\"}}", - "image_family": "daos-client-centos7", + "image_name": "daos-client-hpc-centos-7-v{{isotime \"20060102-030405\"}}", + "image_family": "daos-client-hpc-centos-7", "machine_type": "n1-standard-16", "disk_size": "20", "ssh_username": "packer", diff --git a/images/daos-server-image.json b/images/daos-server-image.json index 101ba3e..3a52934 100644 --- a/images/daos-server-image.json +++ b/images/daos-server-image.json @@ -12,8 +12,8 @@ "centos-cloud" ], "source_image_family": "centos-7", - "image_name": "daos-server-centos7-v{{isotime \"20060102-030405\"}}", - "image_family": "daos-server-centos7", + "image_name": "daos-server-centos-7-v{{isotime \"20060102-030405\"}}", + "image_family": "daos-server-centos-7", "machine_type": "n1-standard-16", "disk_size": "20", "ssh_username": "packer", From 788e8c906d38c3c61d96b2d5c810405ac1f795bb Mon Sep 17 00:00:00 2001 From: "Mark A. Olson" Date: Wed, 12 Jan 2022 18:06:25 -0800 Subject: [PATCH 5/5] DAOS-9507 Updates to move IO500 install No longer installing IO500 in client images. mpifileutils and IO500 SC21 will now be installed prior to running the IO500 benchmark. The cleanup step that runs before an IO500 run will now set nr_hugepages in daos_server.yml Logging improvements. Signed-off-by: Mark A. Olson --- .gitignore | 2 +- images/README.md | 27 +++++- images/daos-client-image.json | 10 --- terraform/examples/io500/README.md | 63 +++++++++++-- terraform/examples/io500/clean.sh | 30 ++++++- terraform/examples/io500/configure.sh | 50 +++++++---- .../examples/io500/install_io500-sc21.sh | 27 +++--- .../examples/io500/install_mpifileutils.sh | 18 ++-- terraform/examples/io500/run_io500-sc21.sh | 88 +++++++++++++------ terraform/examples/io500/start.sh | 47 ++++++---- 10 files changed, 260 insertions(+), 102 deletions(-) mode change 100755 => 100644 terraform/examples/io500/configure.sh rename images/scripts/io500-sc21_install.sh => terraform/examples/io500/install_io500-sc21.sh (89%) mode change 100644 => 100755 rename images/scripts/mpifileutils_install.sh => terraform/examples/io500/install_mpifileutils.sh (93%) mode change 100644 => 100755 diff --git a/.gitignore b/.gitignore index 8d08364..9c78cd2 100644 --- a/.gitignore +++ b/.gitignore @@ -17,6 +17,6 @@ terraform.tfvars # Ignore other files id_rsa* -hosts +hosts* *.flag keys.txt diff --git a/images/README.md b/images/README.md index 007f541..5a9b0dd 100644 --- a/images/README.md +++ b/images/README.md @@ -11,9 +11,30 @@ You can leverage code from [cloud-builders-community](https://github.com/GoogleC is the first step (the [packer folder](https://github.com/GoogleCloudPlatform/cloud-builders-community/tree/master/packer)). You must have a packer docker container in your project in order for the rest of the scripts in this repository to work. - -## Make a DAOS server image -Simply run the ```make_image.sh``` script. It will pick the pre-configured GCP project from gcloud. +### Set a default project To set your default project, run: ``` gcloud config set project ``` + +## Making images + +Make both DAOS server and client images + +```bash +cd images/ +./make_images.sh +``` + +Make DAOS server image + +```bash +cd images/ +./make_images.sh server +``` + +Make DAOS client image + +```bash +cd images/ +./make_images.sh client +``` diff --git a/images/daos-client-image.json b/images/daos-client-image.json index 39fcc71..10b2889 100644 --- a/images/daos-client-image.json +++ b/images/daos-client-image.json @@ -37,16 +37,6 @@ "execute_command": "echo 'packer' | sudo -S env {{ .Vars }} {{ .Path }}", "environment_vars": "DAOS_VERSION={{user `DAOS_VERSION`}}", "script": "./scripts/install-client.sh" - }, - { - "type": "shell", - "execute_command": "echo 'packer' | sudo -S env {{ .Vars }} {{ .Path }}", - "script": "./scripts/mpifileutils_install.sh" - }, - { - "type": "shell", - "execute_command": "echo 'packer' | sudo -S env {{ .Vars }} {{ .Path }}", - "script": "./scripts/io500-sc21_install.sh" } ] } diff --git a/terraform/examples/io500/README.md b/terraform/examples/io500/README.md index 49620dc..be1b90a 100644 --- a/terraform/examples/io500/README.md +++ b/terraform/examples/io500/README.md @@ -7,19 +7,70 @@ 2. Run [start.sh](start.sh) script to deploy DAOS on GCP from your PC 3. SSH to first DAOS client 2. From first DAOS client: - 1. Run [setup_io500.sh](setup_io500.sh) to finish DAOS environment configuration and benchmark it with IO500 + 1. Run [run_io500-sc21.sh](run_io500-sc21.sh) to finish DAOS environment configuration and benchmark it with IO500 - you can run this script multiple times to do several IO500 benchmarks 3. From your PC: 1. Run [stop.sh](stop.sh) to destroy DAOS environment on GCP ## Scripts definition -[configure.sh](configure.sh) script has all the DAOS configuration that you need to adjust to your needs. It is sourced it other scripts. +### Configuration -[start.sh](start.sh) script is used to deploy DAOS instances on GCP +- [configure.sh](configure.sh) -[setup_io500.sh](setup_io500.sh) script is used to finish DAOS environment configuration and benchmark it with IO500 + This file contains environment variables and is sourced by the other scripts listed below. -[stop.sh](stop.sh) script is used to destroy DAOS instances on GCP + You can make changes to this file in order to customize the machine types, + number of disks, IO500 stonewall time, etc. -[clean.sh](clean.sh) script is used clean DAOS environment to run another IO500 benchmark on the same environment and reconfigure DAOS server configuration +### Scripts that are run by a user + +- [start.sh](start.sh) + + Used to deploy DAOS instances on GCP + +- [stop.sh](stop.sh) + + Destroy DAOS instances on GCP. + + To avoid unnecessary GCP Compute costs always be sure to run this when + you are finished using the DAOS environment that was created with `start.sh` + +- [run_io500-sc21.sh](run_io500-sc21.sh) + + Should always be run only on the first DAOS client instance. + + - Installs IO500 dependencies (if missing) + - Configures DAOS environment + - Runs the IO500 SC21 benchmark + - Copies results into a timestamped directory + + After you have run start.sh and set up your DAOS server and client instances + this script can be run multiple times. + +### Supporting Scripts + +These are not intended to be directly run by a user. They are called by other +scripts. + +- [install_mpifileutils.sh](install_mpifileutils.sh) + + Installs a patched version of mpifileutils that allows IO500 to work with + DAOS. + + Run by the `run_io500-sc21.sh` if mpifileutils is not already installed. + Run on the DAOS client instances by the `run_io500-sc21.sh` if + the patched version of mpifileutils is not already installed. + + Required to be run before running `install_io500-sc21.sh` + +- [install_io500-sc21.sh](install_io500-sc21.sh) + + Installs IO500 SC21 on the DAOS client nodes. + + Run on the DAOS client instances by the `run_io500-sc21.sh` if + IO500 SC21 is not already installed. + + +- [clean.sh](clean.sh) + lean DAOS environment to run another IO500 benchmark on the same environment and reconfigure DAOS server configuration diff --git a/terraform/examples/io500/clean.sh b/terraform/examples/io500/clean.sh index dab626b..0c2a13a 100755 --- a/terraform/examples/io500/clean.sh +++ b/terraform/examples/io500/clean.sh @@ -6,16 +6,40 @@ trap 'echo "Hit an unexpected and unchecked error. Exiting."' ERR # Load needed variables source ./configure.sh +log() { + local msg="| $1 |" + line=$(printf "${msg}" | sed 's/./-/g') + tput setaf 14 # set Cyan color + printf -- "\n${line}\n${msg}\n${line}\n" + tput sgr0 # reset color +} + for server in ${SERVERS} do - printf "\nStart cleaning ${server}\n\n" + log "Cleaning ${server}" ssh ${server} "rm -f .ssh/known_hosts" ssh ${server} "sudo systemctl stop daos_server" ssh ${server} "sudo rm -rf /var/daos/ram/*" ssh ${server} "sudo umount /var/daos/ram/ && echo success || echo unmounted" + + # Set nr_hugepages value + # nr_hugepages = (targets * 1Gib) / hugepagesize + # Example: for 8 targets and Hugepagesize = 2048 kB: + # Targets = 8 + # 1Gib = 1048576 KiB + # Hugepagesize = 2048kB + # nr_hugepages=(8*1048576) / 2048 + # So nr_hugepages value is 4096 + hugepagesize=$(ssh ${server} "grep Hugepagesize /proc/meminfo | awk '{print \$2}'") + nr_hugepages=$(( (${DAOS_SERVER_DISK_COUNT}*1048576) / ${hugepagesize} )) + ssh ${server} "sudo sed -i \"s/^nr_hugepages:.*/nr_hugepages: ${nr_hugepages}/g\" /etc/daos/daos_server.yml" + ssh ${server} "sudo sed -i \"s/^crt_timeout:.*/crt_timeout: ${DAOS_SERVER_CRT_TIMEOUT}/g\" /etc/daos/daos_server.yml" - ssh ${server} "sudo sed -i \"s/^ targets:.*/ targets: ${DAOS_SERVER_DISK_COUNT}/g\" /etc/daos/daos_server.yml" - ssh ${server} "sudo sed -i \"s/^ scm_size:.*/ scm_size: ${DAOS_SERVER_SCM_SIZE}/g\" /etc/daos/daos_server.yml" + + # storage settings + ssh ${server} "sudo sed -i \"s/^\(\s*\)targets:.*/\1targets: ${DAOS_SERVER_DISK_COUNT}/g\" /etc/daos/daos_server.yml" + ssh ${server} "sudo sed -i \"s/^\(\s*\)scm_size:.*/\1scm_size: ${DAOS_SERVER_SCM_SIZE}/g\" /etc/daos/daos_server.yml" + ssh ${server} "cat /etc/daos/daos_server.yml" ssh ${server} "sudo systemctl start daos_server" sleep 4 diff --git a/terraform/examples/io500/configure.sh b/terraform/examples/io500/configure.sh old mode 100755 new mode 100644 index ce404b9..43d8d00 --- a/terraform/examples/io500/configure.sh +++ b/terraform/examples/io500/configure.sh @@ -1,12 +1,15 @@ #!/bin/bash # ------------------------------------------------------------------------------ -# Configure below variables to your needs +# Configure the following variables to meet your specific needs # ------------------------------------------------------------------------------ -ID="" # Identifier to allow multiple DAOS clusters in the same GCP - # Typically, you want to set this to your username. - # Don't change this value to use the env var '${USER}! It should be - # set to a constant value. +ID="" # Optional identifier to allow multiple DAOS clusters in the same GCP + # project by using this ID in the DAOS server and client instance names. + # Typically, this would contain the username of each user who is running + # the terraform/examples/io500/start.sh script in one GCP project. + # Don't change this value to use the env var '${USER}'! + # This should be set to a constant value and not the value of an + # environment variable. # Server and client instances PREEMPTIBLE_INSTANCES="true" @@ -28,9 +31,21 @@ DAOS_POOL_SIZE="$(( 375 * ${DAOS_SERVER_DISK_COUNT} * ${DAOS_SERVER_INSTANCE_COU DAOS_CONT_REPLICATION_FACTOR="rf:0" # IO500 -IO500_STONEWALL_TIME=5 # Amount of seconds to run the benchmark +IO500_STONEWALL_TIME=5 # Number of seconds to run the benchmark +# ------------------------------------------------------------------------------ +# Modify instance base names if ID variable is set +# ------------------------------------------------------------------------------ +if [[ -z $ID ]] +then + DAOS_SERVER_BASE_NAME="daos-server" + DAOS_CLIENT_BASE_NAME="daos-client" +else + DAOS_SERVER_BASE_NAME="daos-server-${ID}" + DAOS_CLIENT_BASE_NAME="daos-client-${ID}" +fi + # ------------------------------------------------------------------------------ # Terraform environment variables # It's rare that these will need to be changed. @@ -45,32 +60,31 @@ export TF_VAR_preemptible="${PREEMPTIBLE_INSTANCES}" # Servers export TF_VAR_server_number_of_instances=${DAOS_SERVER_INSTANCE_COUNT} export TF_VAR_server_daos_disk_count=${DAOS_SERVER_DISK_COUNT} -export TF_VAR_server_instance_base_name="daos-server-${ID}" +export TF_VAR_server_instance_base_name="${DAOS_SERVER_BASE_NAME}" export TF_VAR_server_os_disk_size_gb=20 export TF_VAR_server_os_disk_type="pd-ssd" -export TF_VAR_server_template_name="daos-server-${ID}" -export TF_VAR_server_mig_name="daos-server-${ID}" +export TF_VAR_server_template_name="${DAOS_SERVER_BASE_NAME}" +export TF_VAR_server_mig_name="${DAOS_SERVER_BASE_NAME}" export TF_VAR_server_machine_type="${DAOS_SERVER_MACHINE_TYPE}" export TF_VAR_server_os_project="${TF_VAR_project_id}" -export TF_VAR_server_os_family="daos-server-centos7" +export TF_VAR_server_os_family="daos-server-centos-7" # Clients export TF_VAR_client_number_of_instances=${DAOS_CLIENT_INSTANCE_COUNT} -export TF_VAR_client_instance_base_name="daos-client-${ID}" +export TF_VAR_client_instance_base_name="${DAOS_CLIENT_BASE_NAME}" export TF_VAR_client_os_disk_size_gb=20 export TF_VAR_client_os_disk_type="pd-ssd" -export TF_VAR_client_template_name="daos-client-${ID}" -export TF_VAR_client_mig_name="daos-client-${ID}" +export TF_VAR_client_template_name="${DAOS_CLIENT_BASE_NAME}" +export TF_VAR_client_mig_name="${DAOS_CLIENT_BASE_NAME}" export TF_VAR_client_machine_type="${DAOS_CLIENT_MACHINE_TYPE}" export TF_VAR_client_os_project="${TF_VAR_project_id}" -export TF_VAR_client_os_family="daos-client-centos7" - +export TF_VAR_client_os_family="daos-client-hpc-centos-7" # ------------------------------------------------------------------------------ # Create hosts file # ------------------------------------------------------------------------------ -CLIENT_NAME="daos-client-${ID}" -SERVER_NAME="daos-server-${ID}" +CLIENT_NAME="${DAOS_CLIENT_BASE_NAME}" +SERVER_NAME="${DAOS_SERVER_BASE_NAME}" rm -f hosts unset CLIENTS @@ -83,6 +97,8 @@ do echo ${CLIENT_NAME}-$(printf %04d ${i})>>hosts done +cat hosts | tail -n+2 > hosts_no_first + for ((i=1; i <= ${DAOS_SERVER_INSTANCE_COUNT} ; i++)) do SERVERS+="${SERVER_NAME}-$(printf %04d ${i}) " diff --git a/images/scripts/io500-sc21_install.sh b/terraform/examples/io500/install_io500-sc21.sh old mode 100644 new mode 100755 similarity index 89% rename from images/scripts/io500-sc21_install.sh rename to terraform/examples/io500/install_io500-sc21.sh index 59329c8..68f8e08 --- a/images/scripts/io500-sc21_install.sh +++ b/terraform/examples/io500/install_io500-sc21.sh @@ -35,18 +35,15 @@ export MY_IO500_PATH log() { - local msg="$1" - printf "%80s" | tr " " "-" - printf "\n%s\n" "${msg}" - printf "%80s\n" | tr " " "-" + local msg="| $1 |" + line=$(printf "${msg}" | sed 's/./-/g') + # FIX: Can't use tput when running this script with pdsh + #tput setaf 14 # set Cyan color + printf -- "\n${line}\n${msg}\n${line}\n" + #tput sgr0 # reset color } - -printf " -=============================================================================== -Installing IO500 ${IO500_VERSION_TAG} -=============================================================================== -" +log "Installing IO500 ${IO500_VERSION_TAG}" # Load Intel MPI export I_MPI_OFI_LIBRARY_INTERNAL=0 @@ -62,6 +59,11 @@ mkdir -p "${IO500_INSTALL_PATH}" cd "${IO500_INSTALL_PATH}" log "Cloning https://github.com/IO500/io500 repo. Tag ${IO500_VERSION_TAG}" +if [[ -d "${MY_IO500_PATH}" ]] +then + rm -rf "${MY_IO500_PATH}" +fi + git clone https://github.com/IO500/io500.git \ -b ${IO500_VERSION_TAG} \ "${MY_IO500_PATH}" @@ -72,7 +74,10 @@ git checkout -b "${IO500_VERSION_TAG}-daos" # Point to the pfind that works with our mpifileutils # Build ior with DFS support log "Patching ${MY_IO500_PATH}/prepare.sh" - +cd "${MY_IO500_PATH}" +# Attempt to always ensure the patch applies successfully +cp prepare.sh prepare.sh.$(date "+%Y-%m-%d_%H%M%S") +git checkout prepare.sh cat > io500_prepare.patch <<'EOF' diff --git a/prepare.sh b/prepare.sh index f793dfe..d4cb7e8 100755 diff --git a/images/scripts/mpifileutils_install.sh b/terraform/examples/io500/install_mpifileutils.sh old mode 100644 new mode 100755 similarity index 93% rename from images/scripts/mpifileutils_install.sh rename to terraform/examples/io500/install_mpifileutils.sh index 83b2499..afe85dd --- a/images/scripts/mpifileutils_install.sh +++ b/terraform/examples/io500/install_mpifileutils.sh @@ -24,18 +24,18 @@ MFU_INSTALL_DIR="${MFU_ROOT_DIR}/install" CMAKE_VERSION="3.22.1" + log() { - local msg="$1" - printf "%80s" | tr " " "-" - printf "\n%s\n" "${msg}" - printf "%80s\n" | tr " " "-" + local msg="| $1 |" + line=$(printf "${msg}" | sed 's/./-/g') + # FIX: Can't use tput when running this script with pdsh + #tput setaf 14 # set Cyan color + printf -- "\n${line}\n${msg}\n${line}\n" + #tput sgr0 # reset color } -printf " -=============================================================================== -Installing mpifileutils -=============================================================================== -" + +log "Installing mpifileutils" # Exit if Intel OneAPI is not installed if [ ! -d /opt/intel/oneapi ];then diff --git a/terraform/examples/io500/run_io500-sc21.sh b/terraform/examples/io500/run_io500-sc21.sh index ee457f3..2ac3472 100755 --- a/terraform/examples/io500/run_io500-sc21.sh +++ b/terraform/examples/io500/run_io500-sc21.sh @@ -2,47 +2,76 @@ # # Configure DAOS storage and runs an IO500 benchmark # +# Instructions that were referenced to create this script are at +# https://daosio.atlassian.net/wiki/spaces/DC/pages/11055792129/IO-500+SC21 +# set -e +trap 'echo "Hit an unexpected and unchecked error. Unmounting and exiting."; unmount' ERR # Load needed variables source ./configure.sh -IO500_VERSION_TAG=io500-sc21 +export IO500_VERSION_TAG=io500-sc21 # Set environment variable defaults if not already set # This allows for the variables to be set to different values externally. : "${IO500_INSTALL_DIR:=/usr/local}" : "${IO500_DIR:=${IO500_INSTALL_DIR}/${IO500_VERSION_TAG}}" +: "${IO500_RESULTS_DFUSE_DIR:=${HOME}/daos_fuse/${IO500_VERSION_TAG}/results}" : "${IO500_RESULTS_DIR:=${HOME}/${IO500_VERSION_TAG}/results}" : "${POOL_LABEL:=io500_pool}" : "${CONT_LABEL:=io500_cont}" log() { - local msg="$1" - printf "\n%80s" | tr " " "-" - printf "\n%s\n" "${msg}" - printf "%80s\n" | tr " " "-" + local msg="| $1 |" + line=$(printf "${msg}" | sed 's/./-/g') + tput setaf 14 # set Cyan color + printf -- "\n${line}\n${msg}\n${line}\n" + tput sgr0 # reset color } -cleanup(){ - if [[ ! -z $1 ]];then - echo "Hit an unexpected and unchecked error. Cleaning up and exiting." +unmount() { + if [[ -d "${IO500_RESULTS_DFUSE_DIR}" ]] + then + log "Unmount DFuse mountpoint ${IO500_RESULTS_DFUSE_DIR}" + pdsh -w ^hosts sudo fusermount3 -u "${IO500_RESULTS_DFUSE_DIR}" + pdsh -w ^hosts rm -rf "${IO500_RESULTS_DFUSE_DIR}" + pdsh -w ^hosts mount | sort | grep dfuse || true + printf "\nfusermount3 complete!\n\n" fi +} +cleanup(){ log "Clean up" - if [[ -d "${IO500_RESULTS_DIR}" ]];then - echo "Unmount DFuse mountpoint ${IO500_RESULTS_DIR}" - pdsh -w ^hosts sudo fusermount -u "${IO500_RESULTS_DIR}" - echo "fusermount complete!" + if [[ -d "${IO500_RESULTS_DFUSE_DIR}" ]] + then + unmount fi source ./clean.sh } -#trap cleanup ERR - log "Prepare for IO500 ${IO500_VERSION_TAG^^} run" +log "Copy install_*.sh files to client instances" +pdcp -w ^hosts install_*.sh ~ + +# Install mpifileutils if not already installed +if [[ ! -d /usr/local/mpifileutils/install/bin ]] +then + printf "\nRun install_mpifileutils.sh on client nodes\n\n" + sudo ./install_mpifileutils.sh + pdsh -w ^hosts_no_first "sudo ./install_mpifileutils.sh" +fi + +# Install IO500 if not already installed +if [[ ! -d "${IO500_DIR}" ]] +then + printf "\nRun install_${IO500_VERSION_TAG,,}.sh on client nodes\n\n" + sudo "./install_${IO500_VERSION_TAG}.sh" + pdsh -w ^hosts_no_first "sudo ./install_${IO500_VERSION_TAG,,}.sh" +fi + cleanup printf "\nCopy SSH keys to client nodes\n\n" @@ -88,18 +117,13 @@ dmg pool query "${POOL_LABEL}" log "Create container: label=${CONT_LABEL}" daos container create --type=POSIX --properties="${DAOS_CONT_REPLICATION_FACTOR}" --label="${CONT_LABEL}" "${POOL_LABEL}" -#export DAOS_CONT_UUID=$(daos -j container create --type=POSIX --properties="${DAOS_CONT_REPLICATION_FACTOR}" --label="${CONT_LABEL}" "${POOL_LABEL}" | jq -r .response.container_uuid) -#echo "DAOS_CONT_UUID:" ${DAOS_CONT_UUID} # Show container properties daos cont get-prop ${POOL_LABEL} ${CONT_LABEL} -export IO500_RESULTS_DIR="${HOME}/io500-${IO500_VERSION_TAG}/results" -pdsh -w ^hosts mkdir -p "${IO500_RESULTS_DIR}" - -log "Use dfuse to mount ${CONT_LABEL} on ${IO500_RESULTS_DIR}" -pdsh -w ^hosts sudo rm -rf "${IO500_RESULTS_DIR}" -pdsh -w ^hosts mkdir -p "${IO500_RESULTS_DIR}" -pdsh -w ^hosts dfuse --pool="${POOL_LABEL}" --container="${CONT_LABEL}" --mountpoint="${IO500_RESULTS_DIR}" +log "Use dfuse to mount ${CONT_LABEL} on ${IO500_RESULTS_DFUSE_DIR}" +pdsh -w ^hosts sudo rm -rf "${IO500_RESULTS_DFUSE_DIR}" +pdsh -w ^hosts mkdir -p "${IO500_RESULTS_DFUSE_DIR}" +pdsh -w ^hosts dfuse --pool="${POOL_LABEL}" --container="${CONT_LABEL}" --mountpoint="${IO500_RESULTS_DFUSE_DIR}" sleep 10 echo "DFuse complete!" @@ -121,18 +145,30 @@ export IO500_NP=$(( ${DAOS_CLIENT_INSTANCE_COUNT} * $(nproc --all) )) cp -f "${IO500_DIR}/config-full-sc21.ini" . envsubst < config-full-sc21.ini > temp.ini -sed -i "s|^resultdir.*|resultdir = ${IO500_RESULTS_DIR}|g" temp.ini +sed -i "s|^resultdir.*|resultdir = ${IO500_RESULTS_DFUSE_DIR}|g" temp.ini sed -i "s/^stonewall-time.*/stonewall-time = ${IO500_STONEWALL_TIME}/g" temp.ini sed -i "s/^transferSize.*/transferSize = 4m/g" temp.ini #sed -i "s/^blockSize.*/blockSize = 1000000m/g" temp.ini # This causes failures sed -i "s/^filePerProc.*/filePerProc = TRUE /g" temp.ini sed -i "s/^nproc.*/nproc = ${IO500_NP}/g" temp.ini +# Prepare final results directory for the current run +TIMESTAMP=$(date "+%Y-%m-%d_%H%M%S") +IO500_RESULTS_DIR_TIMESTAMPED="${IO500_RESULTS_DIR}/${TIMESTAMP}" +mkdir -p "${IO500_RESULTS_DIR_TIMESTAMPED}" + log "Run IO500" mpirun -np ${IO500_NP} \ --hostfile hosts \ --bind-to socket "${IO500_DIR}/io500" temp.ini -cleanup +log "Copy results from ${IO500_RESULTS_DFUSE_DIR} to ${IO500_RESULTS_DIR}" + +rsync -avh "${IO500_RESULTS_DFUSE_DIR}/" "${IO500_RESULTS_DIR_TIMESTAMPED}/" +cp temp.ini "${IO500_RESULTS_DIR_TIMESTAMPED}/" +printenv | sort > "${IO500_RESULTS_DIR_TIMESTAMPED}/env.sh" + +unmount -printf "\nIO500 DONE!\n\n" +printf "IO500 run complete!\n\n" +printf "Results files located in "${IO500_RESULTS_DIR_TIMESTAMPED}"\n\n" diff --git a/terraform/examples/io500/start.sh b/terraform/examples/io500/start.sh index d84ceaf..c254b11 100755 --- a/terraform/examples/io500/start.sh +++ b/terraform/examples/io500/start.sh @@ -6,17 +6,29 @@ trap 'echo "Hit an unexpected and unchecked error. Exiting."' ERR # Load needed variables source ./configure.sh +# Set SSH options for ssh and scp commands +SSH_OPTS="-i id_rsa -o IdentitiesOnly=yes -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null" + log() { - local msg="$1" - printf "\n%80s" | tr " " "-" - printf "\n%s\n" "${msg}" - printf "%80s\n" | tr " " "-" + local msg="| $1 |" + line=$(printf "${msg}" | sed 's/./-/g') + tput setaf 14 # set Cyan color + printf -- "\n${line}\n${msg}\n${line}\n" + tput sgr0 # reset color } +# Build the DAOS disk images if they don't exist in the project +if ! gcloud compute images list | grep -q "${TF_VAR_server_os_family}"; then + log "Building DAOS server image: ${TF_VAR_server_os_family}" + pushd ../../../images + ./make_images.sh "server" + popd +fi + if ! gcloud compute images list | grep -q "${TF_VAR_client_os_family}"; then - log "Building DAOS Server & Client images" + log "Building DAOS client image: ${TF_VAR_client_os_family}" pushd ../../../images - ./make_images.sh + ./make_images.sh "client" popd fi @@ -34,14 +46,14 @@ gcloud compute instance-groups managed wait-until ${TF_VAR_server_template_name} printf "\nAdd external IP to first client\n\n" gcloud compute instances add-access-config ${DAOS_FIRST_CLIENT} --zone ${TF_VAR_zone} && sleep 10 -IP=$(gcloud compute instances describe ${DAOS_FIRST_CLIENT} | grep natIP | awk '{print $2}') +FIRST_CLIENT_IP=$(gcloud compute instances describe ${DAOS_FIRST_CLIENT} | grep natIP | awk '{print $2}') log "Configure SSH access" printf "\nCreate SSH key\n\n" rm -f ./id_rsa* ; ssh-keygen -t rsa -b 4096 -C "${SSH_USER}" -N '' -f id_rsa echo "${SSH_USER}:$(cat id_rsa.pub)" > keys.txt -printf "\nConfiguring SSH on nodes\n\n" +printf "\nConfiguring SSH for user '${SSH_USER}' on all nodes\n\n" for node in $ALL_NODES do # Disable OSLogin to be able to connect with SSH keys uploaded in next command @@ -54,21 +66,24 @@ done wait printf "\nCopy SSH key to first DAOS client\n\n" -scp -i id_rsa -o IdentitiesOnly=yes -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null \ +scp ${SSH_OPTS} \ id_rsa \ id_rsa.pub \ - "${SSH_USER}@${IP}:~/.ssh" -ssh -i id_rsa -o IdentitiesOnly=yes -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null \ - ${SSH_USER}@${IP} \ + "${SSH_USER}@${FIRST_CLIENT_IP}:~/.ssh" +ssh ${SSH_OPTS} ${SSH_USER}@${FIRST_CLIENT_IP} \ "printf 'Host *\n StrictHostKeyChecking no\n IdentityFile ~/.ssh/id_rsa\n' > ~/.ssh/config && \ chmod -R 600 .ssh/*" log "Copy files to first client" -scp -i id_rsa -o IdentitiesOnly=yes -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null \ +scp ${SSH_OPTS} \ clean.sh \ configure.sh \ run_io500-sc21.sh \ - "${SSH_USER}@${IP}:~" + install_io500-sc21.sh \ + install_mpifileutils.sh \ + "${SSH_USER}@${FIRST_CLIENT_IP}:~" + +ssh ${SSH_OPTS} ${SSH_USER}@${FIRST_CLIENT_IP} "chmod +x ~/*.sh && chmod -x ~/configure.sh" log "DAOS servers and clients deployed successfully" gcloud compute instances list --filter="name:daos*" @@ -78,9 +93,9 @@ printf " To run an IO500 benchmark: 1. Log into the first client - ssh -i id_rsa ${SSH_USER}@${IP} + ssh -i id_rsa ${SSH_USER}@${FIRST_CLIENT_IP} -2. Run the script +2. Run IO500 ~/run_io500-sc21.sh "