-
Notifications
You must be signed in to change notification settings - Fork 16
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add setup directory with scripts that automate deployment and benchma…
…rking (#7) Signed-off-by: Łukasz Sitkiewicz <lukasz.sitkiewicz@intel.com>
- Loading branch information
Showing
6 changed files
with
310 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,25 @@ | ||
# Setup | ||
|
||
## Deployment | ||
|
||
1. From your PC: | ||
1. Configure variables to your needs in [configure.sh](configure.sh) script | ||
2. Run [start.sh](start.sh) script to deploy DAOS on GCP from your PC | ||
3. SSH to first DAOS client | ||
2. From first DAOS client: | ||
1. Run [setup_io500.sh](setup_io500.sh) to finish DAOS environment configuration and benchmark it with IO500 | ||
- you can run this script multiple times to do several IO500 benchmarks | ||
3. From your PC: | ||
1. Run [stop.sh](stop.sh) to destroy DAOS environment on GCP | ||
|
||
## Scripts definition | ||
|
||
[configure.sh](configure.sh) script has all the DAOS configuration that you need to adjust to your needs. It is sourced by the other scripts. | ||
|
||
[start.sh](start.sh) script is used to deploy DAOS instances on GCP | ||
|
||
[setup_io500.sh](setup_io500.sh) script is used to finish DAOS environment configuration and benchmark it with IO500 | ||
|
||
[stop.sh](stop.sh) script is used to destroy DAOS instances on GCP | ||
|
||
[clean.sh](clean.sh) script is used to clean the DAOS environment so that another IO500 benchmark can be run on the same environment, and to reconfigure the DAOS server configuration |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,26 @@ | ||
#!/bin/bash
#
# clean.sh - reset every DAOS server so another IO500 run can start clean.
#
# For each host in ${SERVERS} (space-separated list built by configure.sh):
#   1. drop the remote user's .ssh/known_hosts
#   2. stop daos_server, wipe /var/daos/ram and unmount it
#   3. rewrite crt_timeout / targets / scm_size in daos_server.yml from
#      CRT_TIMEOUT, DAOS_DISK_COUNT and SCM_SIZE
#   4. print the resulting config, start daos_server and show its status
#
# Must be run (or sourced) from the directory that contains configure.sh.

set -e
trap 'echo "Hit an unexpected and unchecked error. Exiting."' ERR

# Load needed variables
source ./configure.sh

daos_server_yml="/etc/daos/daos_server.yml"

# SERVERS is a whitespace-separated string, so it is left unquoted on purpose
# to get one iteration per host name.
for server in ${SERVERS}; do
  echo "#######################"
  echo "# Cleaning ${server}"
  echo "#######################"
  ssh "${server}" "rm -f .ssh/known_hosts"
  ssh "${server}" "sudo systemctl stop daos_server"
  ssh "${server}" "sudo rm -rf /var/daos/ram/*"
  # Best effort: an already-unmounted ramdisk must not abort the run, so the
  # remote command always ends with a successful echo.
  ssh "${server}" "sudo umount /var/daos/ram/ && echo success || echo unmounted"
  ssh "${server}" "sudo sed -i 's/^crt_timeout:.*/crt_timeout: ${CRT_TIMEOUT}/' ${daos_server_yml}"
  # targets / scm_size are nested keys. Capture whatever indentation the line
  # already has and put it back, so the edit works regardless of how deeply
  # the key is indented and never changes the YAML structure.
  ssh "${server}" "sudo sed -i 's/^\([[:space:]]*\)targets:.*/\1targets: ${DAOS_DISK_COUNT}/' ${daos_server_yml}"
  ssh "${server}" "sudo sed -i 's/^\([[:space:]]*\)scm_size:.*/\1scm_size: ${SCM_SIZE}/' ${daos_server_yml}"
  ssh "${server}" "cat ${daos_server_yml}"
  ssh "${server}" "sudo systemctl start daos_server"
  # Give the service a moment to come up before querying its status.
  sleep 4
  ssh "${server}" "sudo systemctl status daos_server"
  echo "Done"
done
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,81 @@ | ||
#!/bin/bash
#
# configure.sh - shared settings for the deployment / benchmark scripts.
#
# This file is meant to be sourced. Sourcing it:
#   * defines the tunables below and the derived POOL_SIZE,
#   * exports the TF_VAR_* variables,
#   * (re)creates a "hosts" file in the current directory with one client
#     host name per line,
#   * exports SERVERS, CLIENTS and ALL_NODES (space-separated host names) and
#     sets DAOS_FIRST_SERVER, DAOS_FIRST_CLIENT, SERVERS_LIST_WITH_COMMA.

#######################################
# Print the pool size string for a cluster.
# Each DAOS disk is counted as 375 GB. Totals of 1000 GB and above are printed
# in whole TB (rounded down); smaller totals are printed in GB so the result
# never collapses to "0TB".
# Arguments: $1 - DAOS disks per server, $2 - number of servers
# Outputs:   size string such as "3TB" or "750GB" on stdout
#######################################
_daos_pool_size() {
  local disks_per_server=$1
  local server_count=$2
  local total_gb=$(( 375 * disks_per_server * server_count ))

  if (( total_gb >= 1000 )); then
    printf '%sTB\n' "$(( total_gb / 1000 ))"
  else
    printf '%sGB\n' "${total_gb}"
  fi
}

# Configure below variables to your needs
#--------------------------------------------------------
ID="" # Identifier for deploying multiple environments in GCP
PREEMPTIBLE_INSTANCES="true"
NUMBER_OF_SERVERS_INSTANCES="1"
DAOS_DISK_COUNT=8
NUMBER_OF_CLIENTS_INSTANCES="1"
SERVER_MACHINE_TYPE=n2-highmem-32 # n2-custom-20-131072 n2-custom-40-262144 n2-highmem-32 n2-standard-2
CLIENT_MACHINE_TYPE=c2-standard-16 # c2-standard-16 n2-standard-2
CRT_TIMEOUT=300
SCM_SIZE=100
STONEWALL_TIME=3
POOL_SIZE="$(_daos_pool_size "${DAOS_DISK_COUNT}" "${NUMBER_OF_SERVERS_INSTANCES}")"
CONTAINER_REPLICATION_FACTOR="rf:0"
SSH_USER="daos-user"

# Terraform environmental variables
export TF_VAR_project_id=""
export TF_VAR_network="default"
export TF_VAR_subnetwork="default"
export TF_VAR_subnetwork_project="${TF_VAR_project_id}"
export TF_VAR_region="us-central1"
export TF_VAR_zone="us-central1-f"
export TF_VAR_preemptible="${PREEMPTIBLE_INSTANCES}"
# Servers
export TF_VAR_server_number_of_instances=${NUMBER_OF_SERVERS_INSTANCES}
export TF_VAR_server_daos_disk_count=${DAOS_DISK_COUNT}
export TF_VAR_server_instance_base_name="daos-server-${ID}"
export TF_VAR_server_os_disk_size_gb=20
export TF_VAR_server_os_disk_type="pd-ssd"
export TF_VAR_server_template_name="daos-server-${ID}"
export TF_VAR_server_mig_name="daos-server-${ID}"
export TF_VAR_server_machine_type="${SERVER_MACHINE_TYPE}"
export TF_VAR_server_os_project="${TF_VAR_project_id}"
export TF_VAR_server_os_family="daos-server"
# Clients
export TF_VAR_client_number_of_instances=${NUMBER_OF_CLIENTS_INSTANCES}
export TF_VAR_client_instance_base_name="daos-client-${ID}"
export TF_VAR_client_os_disk_size_gb=20
export TF_VAR_client_os_disk_type="pd-ssd"
export TF_VAR_client_template_name="daos-client-${ID}"
export TF_VAR_client_mig_name="daos-client-${ID}"
export TF_VAR_client_machine_type="${CLIENT_MACHINE_TYPE}"
export TF_VAR_client_os_project="${TF_VAR_project_id}"
export TF_VAR_client_os_family="daos-client"

#######################
#  Create hosts file  #
#######################

CLIENT_NAME="daos-client-${ID}"
SERVER_NAME="daos-server-${ID}"

# Start from a clean slate so that re-sourcing this file does not append to
# values left over from a previous source.
rm -f hosts
unset CLIENTS
unset SERVERS
unset ALL_NODES

# Host names are "<base name>-<4-digit, 1-based index>". Each list entry is
# followed by a single space; consumers rely on word splitting.
for ((i = 1; i <= NUMBER_OF_CLIENTS_INSTANCES; i++)); do
  printf -v node_name '%s-%04d' "${CLIENT_NAME}" "${i}"
  CLIENTS+="${node_name} "
  # Only clients go into the hosts file.
  echo "${node_name}" >>hosts
done

for ((i = 1; i <= NUMBER_OF_SERVERS_INSTANCES; i++)); do
  printf -v node_name '%s-%04d' "${SERVER_NAME}" "${i}"
  SERVERS+="${node_name} "
done
unset node_name

ALL_NODES="${SERVERS} ${CLIENTS}"
export ALL_NODES

export SERVERS
export CLIENTS

# The unquoted expansions below are deliberate: word splitting collapses the
# trailing space so awk / tr see a clean, single-space separated list.
DAOS_FIRST_SERVER=$(echo ${SERVERS} | awk '{print $1}')
DAOS_FIRST_CLIENT=$(echo ${CLIENTS} | awk '{print $1}')

SERVERS_LIST_WITH_COMMA=$(echo ${SERVERS} | tr ' ' ',')
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,90 @@ | ||
#!/bin/bash
#
# setup_io500.sh - finish DAOS configuration and run an IO500 benchmark.
# Expects configure.sh and clean.sh in the current directory and a "hosts"
# file (written by configure.sh) listing the client nodes for pdsh/pdcp.

set -e
trap 'echo "Hit an unexpected and unchecked error. Exiting."' ERR

# Shared settings (host lists, pool size, ...)
source ./configure.sh

# Reset and reconfigure every DAOS server before this run
source ./clean.sh

echo "Copy agent config files from server"
rm -f .ssh/known_hosts
# Fetch both config files from the first server into the current directory.
for cfg_file in daos_agent.yml daos_control.yml; do
  scp "${DAOS_FIRST_SERVER}:/etc/daos/${cfg_file}" .
done

echo "Configure DAOS Clients"
pdsh -w ^hosts rm -f .ssh/known_hosts
pdsh -w ^hosts sudo systemctl stop daos_agent
# Push the configs to every client's home directory, then install them
# under /etc/daos with sudo and bring the agent back up.
pdcp -w ^hosts daos_agent.yml daos_control.yml ~
pdsh -w ^hosts sudo cp daos_agent.yml daos_control.yml /etc/daos/
pdsh -w ^hosts sudo systemctl start daos_agent

echo "Format DAOS"
dmg -i -l "${SERVERS_LIST_WITH_COMMA}" storage scan --verbose
dmg -i -l "${SERVERS_LIST_WITH_COMMA}" storage format --reformat

echo "Wait for DAOS storage reformat to finish"
printf "Waiting"
# Poll every 10 seconds until the number of output lines containing "joined"
# equals the number of server instances.
until [ "$(dmg -i -j system query -v | grep -c joined)" -eq "${NUMBER_OF_SERVERS_INSTANCES}" ]; do
  printf "."
  sleep 10
done
echo "Done"
dmg -i system query -v
|
||
echo "Create DAOS Pool ${POOL_SIZE}"
# Capture the dmg output on its own line: "export VAR=$(cmd)" would hide a
# failing command behind export's exit status.
if ! pool_create_json=$(dmg -i -j pool create -z "${POOL_SIZE}" -t 3 -u "${USER}"); then
  echo "dmg pool create failed: ${pool_create_json}" >&2
  exit 1
fi
DAOS_POOL=$(jq -r .response.uuid <<<"${pool_create_json}")
# jq prints the literal string "null" when the field is absent.
if [[ -z "${DAOS_POOL}" || "${DAOS_POOL}" == "null" ]]; then
  echo "Could not read pool UUID from dmg output: ${pool_create_json}" >&2
  exit 1
fi
export DAOS_POOL
echo "DAOS_POOL:" ${DAOS_POOL}
# Show information about a created pool (-i to match every other dmg call)
dmg -i pool query --pool "${DAOS_POOL}"
# Modify a pool's DAOS_PO_RECLAIM reclaim strategies property to never trigger aggregation
dmg -i pool set-prop --pool "${DAOS_POOL}" --name=reclaim --value=disabled

echo "Create DAOS Pool container"
if ! cont_create_out=$(daos container create --type POSIX --pool "${DAOS_POOL}" --properties "${CONTAINER_REPLICATION_FACTOR}"); then
  echo "daos container create failed: ${cont_create_out}" >&2
  exit 1
fi
# The container UUID is taken as the 36-character hex-and-dash token at the
# end of a line of the command output.
if ! DAOS_CONT=$(grep -E -o '[0-9a-f-]{36}$' <<<"${cont_create_out}"); then
  echo "Could not find container UUID in: ${cont_create_out}" >&2
  exit 1
fi
export DAOS_CONT
echo "DAOS_CONT:" ${DAOS_CONT}
# Show container properties
daos cont get-prop --pool "${DAOS_POOL}" --cont "${DAOS_CONT}"

echo "Mount with DFuse DAOS pool to OS"
export DAOS_FUSE=${HOME}/io500/results
pdsh -w ^hosts mkdir -p "${DAOS_FUSE}"
pdsh -w ^hosts dfuse --pool="${DAOS_POOL}" --container="${DAOS_CONT}" -m "${DAOS_FUSE}"
# Fixed pause after launching dfuse on the clients before continuing.
sleep 10
echo "DFuse complete!"
|
||
echo "Export needed ENVs"
export I_MPI_OFI_LIBRARY_INTERNAL=0
export I_MPI_OFI_PROVIDER="tcp;ofi_rxm"
export FI_OFI_RXM_USE_SRX=1
export FI_UNIVERSE_SIZE=16383
source /opt/intel/oneapi/setvars.sh
export PATH=$PATH:/usr/local/io500/bin
# Prepend instead of overwrite, so whatever setvars.sh (or the caller) put on
# the library path stays available. The mpifileutils directory still comes
# first, as before.
export LD_LIBRARY_PATH="/usr/local/mpifileutils/install/lib64/${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}"

echo "Prepare config file for IO500"
cp /usr/local/io500/config-full.ini .
# Expand environment variable references in the template.
envsubst < config-full.ini > temp.ini

# Total MPI ranks: CPU count reported by nproc on this node times the number
# of clients. NOTE(review): presumes all clients have the same CPU count as
# the node running this script - confirm.
io500_nproc=$(( NUMBER_OF_CLIENTS_INSTANCES * $(nproc --all) ))

sed -i \
  -e "s/^stonewall-time.*/stonewall-time = ${STONEWALL_TIME}/g" \
  -e "s/^transferSize.*/transferSize = 4m/g" \
  -e "s/^blockSize.*/blockSize = 1000000m/g" \
  -e "s/^filePerProc.*/filePerProc = TRUE /g" \
  -e "s/^nproc.*/nproc = ${io500_nproc}/g" \
  temp.ini

# Run IO500 benchmark
mpirun --hostfile hosts -env I_MPI_OFI_PROVIDER="tcp;ofi_rxm" --bind-to socket -np "${io500_nproc}" /usr/local/io500/io500 temp.ini

echo "Cleaning up after run ..."
echo "Unmount DFuse mountpoint"
pdsh -w ^hosts sudo fusermount -u "${DAOS_FUSE}"
echo "fusermount complete!"
echo "Delete DAOS pool"
res=$(dmg -i pool destroy --pool "${DAOS_POOL}")
echo "dmg says: " $res
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,73 @@ | ||
#!/bin/bash
#
# start.sh - build the DAOS images (first run only) and deploy the DAOS
# servers and clients with Terraform. Run from the directory that holds
# configure.sh.

set -e
trap 'echo "Hit an unexpected and unchecked error. Exiting."' ERR

# Shared settings, including the exported TF_VAR_* variables
source ./configure.sh

# images_were_built.flag is a marker file: once it exists, the image build
# is skipped on later runs.
if ! [ -f images_were_built.flag ]; then
  cat <<'BANNER'
##########################
# Building DAOS images #
##########################
BANNER
  pushd ../../../images
  ./make_images.sh
  popd
  touch images_were_built.flag
fi

cat <<'BANNER'
######################################
# Deploying DAOS Servers & Clients #
######################################
BANNER
pushd ../full_cluster_setup
# Non-interactive init -> plan -> apply; the saved plan file guarantees that
# apply executes exactly what plan produced.
terraform init -input=false
terraform plan -out=tfplan -input=false
terraform apply -input=false tfplan
popd

echo "# Wait for instances"
sleep 10
|
||
echo "# Add external IP to first client, so that it will be accessible over normal SSH"
# Written as "cmd && sleep" so that a failing add-access-config does not abort
# the script under set -e. NOTE(review): presumably this keeps re-runs working
# when the access config already exists - confirm.
gcloud compute instances add-access-config "${DAOS_FIRST_CLIENT}" --zone "${TF_VAR_zone}" && sleep 10
# Ask gcloud for the NAT IP directly instead of scraping the describe output,
# and pass the zone explicitly like the command above does.
IP=$(gcloud compute instances describe "${DAOS_FIRST_CLIENT}" \
  --zone "${TF_VAR_zone}" \
  --format='get(networkInterfaces[0].accessConfigs[0].natIP)')
if [[ -z "${IP}" ]]; then
  echo "Could not determine external IP of ${DAOS_FIRST_CLIENT}" >&2
  exit 1
fi

echo "##########################"
echo "# Configure SSH access #"
echo "##########################"
echo "# Prepare SSH key"
# Always start with a fresh, passphrase-less key pair in the current directory.
rm -f ./id_rsa* ; ssh-keygen -t rsa -b 4096 -C "root" -N '' -f id_rsa
# "<user>:<public key>" line, uploaded below as the ssh-keys metadata value.
echo "${SSH_USER}:$(cat id_rsa.pub)" > keys.txt

# ALL_NODES is a whitespace-separated string; unquoted on purpose.
for node in $ALL_NODES; do
  echo "# Configuring SSH on ${node}"
  # Disable OSLogin to be able to connect with SSH keys uploaded in next command
  gcloud compute instances add-metadata "${node}" --zone "${TF_VAR_zone}" --metadata enable-oslogin=FALSE
  # Upload SSH key to instance, so that you could login to instance over SSH
  gcloud compute instances add-metadata "${node}" --zone "${TF_VAR_zone}" --metadata-from-file ssh-keys=keys.txt
done

echo "# Copy SSH key to first DAOS client"
scp -i id_rsa -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null \
  id_rsa \
  id_rsa.pub \
  "${SSH_USER}@${IP}:~/.ssh"
# Write an SSH client config on the first client that disables host key
# checking and uses the copied key for every host, then restrict permissions
# on the files under .ssh.
ssh -i id_rsa -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null \
  "${SSH_USER}@${IP}" \
  "printf 'Host *\n StrictHostKeyChecking no\n IdentityFile ~/.ssh/id_rsa\n' > ~/.ssh/config && \
chmod -R 600 .ssh/*"

echo "# Copy files"
scp -i id_rsa -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null \
  clean.sh \
  configure.sh \
  setup_io500.sh \
  "${SSH_USER}@${IP}:~"

echo "#########################################################################"
echo "# Now run setup_io500.sh script on ${DAOS_FIRST_CLIENT}"
echo "# SSH to it using this command:"
echo "# ssh -i id_rsa ${SSH_USER}@${IP}"
echo "#########################################################################"
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,15 @@ | ||
#!/bin/bash
#
# stop.sh - destroy the DAOS servers and clients with Terraform.
# Run from the directory that holds configure.sh.

set -e
trap 'echo "Hit an unexpected and unchecked error. Exiting."' ERR

# Shared settings, including the exported TF_VAR_* variables
source ./configure.sh

cat <<'BANNER'
####################################
# Destroying DAOS Servers & Clients
####################################
BANNER

# Run the destroy from the Terraform configuration directory, without the
# interactive confirmation prompt, then return to where we started.
pushd ../full_cluster_setup
terraform destroy -auto-approve
popd