Skip to content

Commit

Permalink
Add setup directory with scripts that automate deployment and benchma…
Browse files Browse the repository at this point in the history
…rking (#7)

Signed-off-by: Łukasz Sitkiewicz <lukasz.sitkiewicz@intel.com>
  • Loading branch information
lsitkiew authored Oct 18, 2021
1 parent 8b4c466 commit 03f2c8d
Show file tree
Hide file tree
Showing 6 changed files with 310 additions and 0 deletions.
25 changes: 25 additions & 0 deletions terraform/examples/io500/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
# Setup

## Deployment

1. From your PC:
1. Configure variables to your needs in [configure.sh](configure.sh) script
2. Run [start.sh](start.sh) script to deploy DAOS on GCP from your PC
3. SSH to first DAOS client
2. From first DAOS client:
1. Run [setup_io500.sh](setup_io500.sh) to finish DAOS environment configuration and benchmark it with IO500
- you can run this script multiple times to do several IO500 benchmarks
3. From your PC:
1. Run [stop.sh](stop.sh) to destroy DAOS environment on GCP

## Scripts definition

[configure.sh](configure.sh) script has all the DAOS configuration that you need to adjust to your needs. It is sourced by the other scripts.

[start.sh](start.sh) script is used to deploy DAOS instances on GCP

[setup_io500.sh](setup_io500.sh) script is used to finish DAOS environment configuration and benchmark it with IO500

[stop.sh](stop.sh) script is used to destroy DAOS instances on GCP

[clean.sh](clean.sh) script is used to clean the DAOS environment, so that another IO500 benchmark can be run on the same environment, and to reconfigure the DAOS server configuration
26 changes: 26 additions & 0 deletions terraform/examples/io500/clean.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
#!/bin/bash
#
# Clean the DAOS servers so that another IO500 benchmark can be run on the
# same environment. For every host in SERVERS this script:
#   1. stops daos_server,
#   2. wipes and unmounts /var/daos/ram,
#   3. rewrites crt_timeout, targets and scm_size in
#      /etc/daos/daos_server.yml from the values in configure.sh,
#   4. starts daos_server again and prints its status.
#
# Must be run (or sourced) from the directory that contains configure.sh.
# Reads: SERVERS, CRT_TIMEOUT, DAOS_DISK_COUNT, SCM_SIZE (set by configure.sh).

set -e
# Diagnostics belong on stderr so they are not mixed into captured output.
trap 'echo "Hit an unexpected and unchecked error. Exiting." >&2' ERR

# Load needed variables
source ./configure.sh

# SERVERS is a space-separated list of host names; word-splitting is intended.
# shellcheck disable=SC2086
for server in ${SERVERS}; do
  echo "#######################"
  echo "# Cleaning ${server}"
  echo "#######################"
  ssh "${server}" "rm -f .ssh/known_hosts"
  ssh "${server}" "sudo systemctl stop daos_server"
  ssh "${server}" "sudo rm -rf /var/daos/ram/*"
  # Best effort: the mount point may already be unmounted, which is not an error.
  ssh "${server}" "sudo umount /var/daos/ram/ && echo success || echo unmounted"
  # NOTE(review): the leading whitespace in the "targets" and "scm_size"
  # patterns has to match the indentation used in daos_server.yml - confirm.
  ssh "${server}" "sudo sed -i \"s/^crt_timeout:.*/crt_timeout: ${CRT_TIMEOUT}/g\" /etc/daos/daos_server.yml"
  ssh "${server}" "sudo sed -i \"s/^ targets:.*/ targets: ${DAOS_DISK_COUNT}/g\" /etc/daos/daos_server.yml"
  ssh "${server}" "sudo sed -i \"s/^ scm_size:.*/ scm_size: ${SCM_SIZE}/g\" /etc/daos/daos_server.yml"
  ssh "${server}" "cat /etc/daos/daos_server.yml"
  ssh "${server}" "sudo systemctl start daos_server"
  # Give the service a moment to come up before asking for its status.
  sleep 4
  ssh "${server}" "sudo systemctl status daos_server"
  echo "Done"
done
81 changes: 81 additions & 0 deletions terraform/examples/io500/configure.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
#!/bin/bash
#
# Shared configuration for the IO500 example. This file is meant to be
# sourced; start.sh, stop.sh, clean.sh and setup_io500.sh all source it.
#
# Besides setting variables it has one side effect: it deletes and recreates
# the file "hosts" in the current directory with one client host name per line.
#
# Nothing here is declared readonly on purpose: setup_io500.sh sources this
# file and then clean.sh, which sources it a second time.

# Configure below variables to your needs
#--------------------------------------------------------
ID="" # Identifier for deploying multiple environments in GCP
PREEMPTIBLE_INSTANCES="true"
NUMBER_OF_SERVERS_INSTANCES="1"
DAOS_DISK_COUNT=8
NUMBER_OF_CLIENTS_INSTANCES="1"
SERVER_MACHINE_TYPE=n2-highmem-32 # n2-custom-20-131072 n2-custom-40-262144 n2-highmem-32 n2-standard-2
CLIENT_MACHINE_TYPE=c2-standard-16 # c2-standard-16 n2-standard-2
CRT_TIMEOUT=300
SCM_SIZE=100
STONEWALL_TIME=3
CONTAINER_REPLICATION_FACTOR="rf:0"
SSH_USER="daos-user"

#######################################
# Compute the DAOS pool size string for a given cluster layout.
# The capacity is 375 units per disk (presumably GB per GCP local SSD -
# TODO confirm). Whole terabytes are printed as "<n>TB", exactly as before;
# layouts below 1000 GB are printed as "<n>GB" instead of being truncated
# to "0TB" by the integer division.
# Arguments: $1 - disks per server, $2 - number of servers
# Outputs:   the size string on stdout (no trailing newline)
#######################################
_io500_pool_size() {
  local -r disks=$1
  local -r servers=$2
  local -r total_gb=$(( 375 * disks * servers ))
  if (( total_gb >= 1000 )); then
    printf '%dTB' "$(( total_gb / 1000 ))"
  else
    printf '%dGB' "${total_gb}"
  fi
}

POOL_SIZE="$(_io500_pool_size "${DAOS_DISK_COUNT}" "${NUMBER_OF_SERVERS_INSTANCES}")"

# Terraform environmental variables
export TF_VAR_project_id=""
export TF_VAR_network="default"
export TF_VAR_subnetwork="default"
export TF_VAR_subnetwork_project="${TF_VAR_project_id}"
export TF_VAR_region="us-central1"
export TF_VAR_zone="us-central1-f"
export TF_VAR_preemptible="${PREEMPTIBLE_INSTANCES}"
# Servers
export TF_VAR_server_number_of_instances="${NUMBER_OF_SERVERS_INSTANCES}"
export TF_VAR_server_daos_disk_count="${DAOS_DISK_COUNT}"
export TF_VAR_server_instance_base_name="daos-server-${ID}"
export TF_VAR_server_os_disk_size_gb=20
export TF_VAR_server_os_disk_type="pd-ssd"
export TF_VAR_server_template_name="daos-server-${ID}"
export TF_VAR_server_mig_name="daos-server-${ID}"
export TF_VAR_server_machine_type="${SERVER_MACHINE_TYPE}"
export TF_VAR_server_os_project="${TF_VAR_project_id}"
export TF_VAR_server_os_family="daos-server"
# Clients
export TF_VAR_client_number_of_instances="${NUMBER_OF_CLIENTS_INSTANCES}"
export TF_VAR_client_instance_base_name="daos-client-${ID}"
export TF_VAR_client_os_disk_size_gb=20
export TF_VAR_client_os_disk_type="pd-ssd"
export TF_VAR_client_template_name="daos-client-${ID}"
export TF_VAR_client_mig_name="daos-client-${ID}"
export TF_VAR_client_machine_type="${CLIENT_MACHINE_TYPE}"
export TF_VAR_client_os_project="${TF_VAR_project_id}"
export TF_VAR_client_os_family="daos-client"

#######################
# Create hosts file   #
#######################

CLIENT_NAME="daos-client-${ID}"
SERVER_NAME="daos-server-${ID}"

# Start from a clean slate so that sourcing this file twice does not append
# duplicates to the lists or to the hosts file.
rm -f hosts
unset CLIENTS
unset SERVERS
unset ALL_NODES

# Host names are "<base name>-NNNN" with a 1-based, zero-padded index.
# Each list entry is followed by a single space.
for ((i = 1; i <= NUMBER_OF_CLIENTS_INSTANCES; i++)); do
  printf -v _io500_node '%s-%04d' "${CLIENT_NAME}" "${i}"
  CLIENTS+="${_io500_node} "
  printf '%s\n' "${_io500_node}" >>hosts
done

for ((i = 1; i <= NUMBER_OF_SERVERS_INSTANCES; i++)); do
  printf -v _io500_node '%s-%04d' "${SERVER_NAME}" "${i}"
  SERVERS+="${_io500_node} "
done
unset _io500_node

ALL_NODES="${SERVERS} ${CLIENTS}"
export ALL_NODES

export SERVERS
export CLIENTS

# First entry of each space-separated list (empty when the list is empty).
DAOS_FIRST_SERVER="${SERVERS%% *}"
DAOS_FIRST_CLIENT="${CLIENTS%% *}"

# Comma-separated server list: drop the trailing separator, then turn the
# remaining single spaces into commas.
SERVERS_LIST_WITH_COMMA="${SERVERS% }"
SERVERS_LIST_WITH_COMMA="${SERVERS_LIST_WITH_COMMA// /,}"
90 changes: 90 additions & 0 deletions terraform/examples/io500/setup_io500.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
#!/bin/bash
#
# Finish the DAOS environment configuration and benchmark it with IO500.
#
# Steps: clean/reconfigure the servers (clean.sh), distribute the agent and
# control configs to the clients listed in ./hosts, format DAOS storage,
# create a pool and a POSIX container, mount the container with dfuse on the
# clients, run IO500 through mpirun, then unmount and destroy the pool.
#
# Must be run from the directory that contains configure.sh and clean.sh.

set -e
# Diagnostics belong on stderr so they are not mixed into captured output.
trap 'echo "Hit an unexpected and unchecked error. Exiting." >&2' ERR

# Load needed variables
source ./configure.sh

# Clean and configure DAOS servers
source ./clean.sh

echo "Copy agent config files from server"
rm -f .ssh/known_hosts
scp "${DAOS_FIRST_SERVER}:/etc/daos/daos_agent.yml" .
scp "${DAOS_FIRST_SERVER}:/etc/daos/daos_control.yml" .

echo "Configure DAOS Clients"
pdsh -w ^hosts rm -f .ssh/known_hosts
pdsh -w ^hosts sudo systemctl stop daos_agent
pdcp -w ^hosts daos_agent.yml daos_control.yml ~
pdsh -w ^hosts sudo cp daos_agent.yml daos_control.yml /etc/daos/
pdsh -w ^hosts sudo systemctl start daos_agent

echo "Format DAOS"
dmg -i -l "${SERVERS_LIST_WITH_COMMA}" storage scan --verbose
dmg -i -l "${SERVERS_LIST_WITH_COMMA}" storage format --reformat

echo "Wait for DAOS storage reformat to finish"
printf "Waiting"
while true; do
  # Count output lines mentioning "joined". grep -c exits non-zero on a count
  # of 0, so "|| true" keeps set -e from aborting while servers are not up.
  joined_count=$(dmg -i -j system query -v | grep -c joined || true)
  if (( ${joined_count:-0} == NUMBER_OF_SERVERS_INSTANCES )); then
    echo "Done"
    dmg -i system query -v
    break
  fi
  printf "."
  sleep 10
done

echo "Create DAOS Pool ${POOL_SIZE}"
# Assign first and export afterwards: "export VAR=$(cmd)" hides cmd's failure.
DAOS_POOL=$(dmg -i -j pool create -z "${POOL_SIZE}" -t 3 -u "${USER}" | jq -r .response.uuid)
# jq prints the literal string "null" when the response carries no uuid.
if [[ -z "${DAOS_POOL}" || "${DAOS_POOL}" == "null" ]]; then
  echo "Failed to create DAOS pool: no pool UUID returned." >&2
  exit 1
fi
export DAOS_POOL
echo "DAOS_POOL:" "${DAOS_POOL}"
# Show information about a created pool
# (-i added so this call matches every other dmg invocation in this script)
dmg -i pool query --pool "${DAOS_POOL}"
# Modify a pool's DAOS_PO_RECLAIM reclaim strategies property to never trigger aggregation
dmg -i pool set-prop --pool "${DAOS_POOL}" --name=reclaim --value=disabled

echo "Create DAOS Pool container"
# The container UUID is the 36-character id at the end of an output line.
DAOS_CONT=$(daos container create --type POSIX --pool "${DAOS_POOL}" --properties "${CONTAINER_REPLICATION_FACTOR}" \
  | grep -E -o '[0-9a-f-]{36}$' || true)
if [[ -z "${DAOS_CONT}" ]]; then
  echo "Failed to create DAOS container: no container UUID returned." >&2
  exit 1
fi
export DAOS_CONT
echo "DAOS_CONT:" "${DAOS_CONT}"
# Show container properties
daos cont get-prop --pool "${DAOS_POOL}" --cont "${DAOS_CONT}"

echo "Mount with DFuse DAOS pool to OS"
export DAOS_FUSE="${HOME}/io500/results"
pdsh -w ^hosts mkdir -p "${DAOS_FUSE}"
pdsh -w ^hosts dfuse --pool="${DAOS_POOL}" --container="${DAOS_CONT}" -m "${DAOS_FUSE}"
sleep 10
echo "DFuse complete!"

echo "Export needed ENVs"
export I_MPI_OFI_LIBRARY_INTERNAL=0
export I_MPI_OFI_PROVIDER="tcp;ofi_rxm"
export FI_OFI_RXM_USE_SRX=1
export FI_UNIVERSE_SIZE=16383
source /opt/intel/oneapi/setvars.sh
export PATH="${PATH}:/usr/local/io500/bin"
# Prepend instead of overwrite, so any library paths set up before this line
# (for example by setvars.sh above) are kept; mpifileutils still wins lookups.
export LD_LIBRARY_PATH="/usr/local/mpifileutils/install/lib64/${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}"

echo "Prepare config file for IO500"
# Total MPI ranks: one per CPU of this client times the number of clients.
total_procs=$(( NUMBER_OF_CLIENTS_INSTANCES * $(nproc --all) ))
cp /usr/local/io500/config-full.ini .
# envsubst fills in the exported variables referenced by the template.
envsubst < config-full.ini > temp.ini
sed -i \
  -e "s/^stonewall-time.*/stonewall-time = ${STONEWALL_TIME}/g" \
  -e "s/^transferSize.*/transferSize = 4m/g" \
  -e "s/^blockSize.*/blockSize = 1000000m/g" \
  -e "s/^filePerProc.*/filePerProc = TRUE /g" \
  -e "s/^nproc.*/nproc = ${total_procs}/g" \
  temp.ini

# Run IO500 benchmark
mpirun --hostfile hosts -env I_MPI_OFI_PROVIDER="tcp;ofi_rxm" --bind-to socket -np "${total_procs}" /usr/local/io500/io500 temp.ini

echo "Cleaning up after run ..."
echo "Unmount DFuse mountpoint"
pdsh -w ^hosts sudo fusermount -u "${DAOS_FUSE}"
echo "fusermount complete!"
echo "Delete DAOS pool"
res=$(dmg -i pool destroy --pool "${DAOS_POOL}")
echo "dmg says: ${res}"
73 changes: 73 additions & 0 deletions terraform/examples/io500/start.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
#!/bin/bash
#
# Deploy DAOS servers and clients on GCP and prepare SSH access.
#
# Steps: build the DAOS images once (guarded by images_were_built.flag),
# apply the Terraform configuration in ../full_cluster_setup, give the first
# client an external IP, generate an SSH key pair and upload it to every
# node, then copy the key and the benchmark scripts to the first client.
#
# Must be run from the directory that contains configure.sh.

set -e
# Diagnostics belong on stderr so they are not mixed into captured output.
trap 'echo "Hit an unexpected and unchecked error. Exiting." >&2' ERR

# Load needed variables
source ./configure.sh

if [[ ! -f images_were_built.flag ]]; then
  echo "##########################"
  echo "# Building DAOS images #"
  echo "##########################"
  pushd ../../../images
  ./make_images.sh
  popd
  # Marker file: skip the image build on later runs.
  touch images_were_built.flag
fi

echo "######################################"
echo "# Deploying DAOS Servers & Clients #"
echo "######################################"
pushd ../full_cluster_setup
terraform init -input=false
terraform plan -out=tfplan -input=false
terraform apply -input=false tfplan
popd

echo "# Wait for instances"
sleep 10

echo "# Add external IP to first client, so that it will be accessible over normal SSH"
# A failure here did not stop the script before either (it was the left side
# of "&&"); keep going, but say so instead of failing silently.
if gcloud compute instances add-access-config "${DAOS_FIRST_CLIENT}" --zone "${TF_VAR_zone}"; then
  sleep 10
else
  echo "# Warning: could not add an external IP to ${DAOS_FIRST_CLIENT} (it may already have one); continuing" >&2
fi

# Ask gcloud for the field directly instead of grepping its YAML output, and
# pass the zone explicitly like the add-access-config call above.
IP=$(gcloud compute instances describe "${DAOS_FIRST_CLIENT}" \
  --zone "${TF_VAR_zone}" \
  --format='value(networkInterfaces[0].accessConfigs[0].natIP)')
if [[ -z "${IP}" ]]; then
  echo "Could not determine the external IP of ${DAOS_FIRST_CLIENT}." >&2
  exit 1
fi

echo "##########################"
echo "# Configure SSH access #"
echo "##########################"
echo "# Prepare SSH key"
rm -f ./id_rsa*
ssh-keygen -t rsa -b 4096 -C "root" -N '' -f id_rsa
# Line format of the ssh-keys metadata value: "<user>:<public key>".
printf '%s:%s\n' "${SSH_USER}" "$(<id_rsa.pub)" > keys.txt

# ALL_NODES is a space-separated list of host names; word-splitting is intended.
# shellcheck disable=SC2086
for node in ${ALL_NODES}; do
  echo "# Configuring SSH on ${node}"
  # Disable OSLogin to be able to connect with SSH keys uploaded in next command
  gcloud compute instances add-metadata "${node}" --zone "${TF_VAR_zone}" --metadata enable-oslogin=FALSE
  # Upload SSH key to instance, so that you could login to instance over SSH
  gcloud compute instances add-metadata "${node}" --zone "${TF_VAR_zone}" --metadata-from-file ssh-keys=keys.txt
done

# Options shared by every connection to the first client: use the generated
# key and skip host-key bookkeeping.
ssh_opts=(-i id_rsa -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null)

echo "# Copy SSH key to first DAOS client"
scp "${ssh_opts[@]}" \
  id_rsa \
  id_rsa.pub \
  "${SSH_USER}@${IP}:~/.ssh"
# Write an SSH client config on the first client that uses the copied key.
ssh "${ssh_opts[@]}" \
  "${SSH_USER}@${IP}" \
  "printf 'Host *\n StrictHostKeyChecking no\n IdentityFile ~/.ssh/id_rsa\n' > ~/.ssh/config && chmod -R 600 .ssh/*"

echo "# Copy files"
scp "${ssh_opts[@]}" \
  clean.sh \
  configure.sh \
  setup_io500.sh \
  "${SSH_USER}@${IP}:~"

echo "#########################################################################"
echo "# Now run setup_io500.sh script on ${DAOS_FIRST_CLIENT}"
echo "# SSH to it using this command:"
echo "# ssh -i id_rsa ${SSH_USER}@${IP}"
echo "#########################################################################"
15 changes: 15 additions & 0 deletions terraform/examples/io500/stop.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
#!/bin/bash
#
# Destroy the DAOS servers and clients on GCP by running "terraform destroy"
# in ../full_cluster_setup. Must be run from the directory that contains
# configure.sh, which supplies the Terraform variables.

set -e

# Print a notice whenever a command fails unexpectedly.
report_failure() {
  echo "Hit an unexpected and unchecked error. Exiting."
}
trap report_failure ERR

# Load needed variables
. ./configure.sh

printf '%s\n' \
  "####################################" \
  "# Destroying DAOS Servers & Clients" \
  "####################################"

pushd ../full_cluster_setup
terraform destroy -auto-approve
popd

0 comments on commit 03f2c8d

Please sign in to comment.