Skip to content

Commit

Permalink
DAOSGCP-224 Updates required for HPC Toolkit hpc-slurm6-daos example (#…
Browse files Browse the repository at this point in the history
…99)

Added retry logic for daos-* package installs

Commented out hard coded network interface config in daos_client.yml since
slurm instances do not force eth0 name for primary ethernet interface.

Signed-off-by: Mark Olson <115657904+mark-olson@users.noreply.github.com>
  • Loading branch information
mark-olson authored Jan 16, 2024
1 parent a7393a9 commit 75c61db
Show file tree
Hide file tree
Showing 2 changed files with 29 additions and 12 deletions.
31 changes: 24 additions & 7 deletions terraform/modules/daos_server/scripts/client_install.sh
Original file line number Diff line number Diff line change
Expand Up @@ -59,18 +59,35 @@ install_epel() {
add_daos_repo() {
local repo_file="${REPO_PATH}/daos.repo"
rm -f "${repo_file}"
echo "Adding DAOS v${DAOS_VERSION} packages repo"
echo "Downloading DAOS v${DAOS_VERSION} packages repo file"
curl -s -k --output "${repo_file}" "https://packages.daos.io/v${DAOS_VERSION}/${DAOS_OS_VERSION}/packages/x86_64/daos_packages.repo"
if [[ "${OS_VERSION_ID}" == "opensuse-leap_15" ]]; then
sed -i 's|gpgkey=.*|gpgkey=https://packages.daos.io/RPM-GPG-KEY|g' "${repo_file}"
if [[ -f ${repo_file} ]]; then
echo "Download of DAOS v${DAOS_VERSION} packages repo file was successful"
else
echo "Download of DAOS v${DAOS_VERSION} packages repo file failed. Exiting."
exit 1
fi
}

install_daos_client() {
"${PKG_MGR}" install -y daos-client daos-devel
# Disable daos_agent service.
# It will be enabled by a startup script after the service has been configured.
systemctl disable daos_agent
local max_attempts=5
local attempt_num=1
local success="false"
while [[ "${success}" == "false" ]] && [ $attempt_num -le $max_attempts ]; do
echo "Installing DAOS v${DAOS_VERSION} admin, client, and develop packages. Attempt: ${attempt_num}"
"${PKG_MGR}" install -y daos-admin daos-client daos-devel
if [[ $? -eq 0 ]]; then
success="true"
echo "DAOS admin, client, and develop packages installed successfully"
echo "Disabling daos_agent service"
echo "daos_agent service will be enabled after the service has been configured."
systemctl disable daos_agent
else
echo "DAOS client install attempt ${attempt_num} failed. Sleeping for 30 seconds before retry ..."
((attempt_num++))
sleep 30
fi
done
}

main() {
Expand Down
10 changes: 5 additions & 5 deletions terraform/modules/daos_server/templates/daos_agent.yml.tftpl
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,8 @@ transport_config:
key: /etc/daos/certs/agent.key
%{ endif }

fabric_ifaces:
- numa_node: 0
devices:
- iface: eth0
domain: eth0
# fabric_ifaces:
# - numa_node: 0
# devices:
# - iface: eth0
# domain: eth0

0 comments on commit 75c61db

Please sign in to comment.