Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix UCX examples for InfiniBand #556

Merged
merged 3 commits into from
Mar 26, 2021
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 12 additions & 2 deletions examples/ucx/client_initialize.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
import click
import cupy

from dask import array as da
from dask.distributed import Client

from dask_cuda.initialize import initialize
Expand Down Expand Up @@ -27,7 +29,7 @@ def main(
ucx_net_devices = None

if enable_infiniband:
enable_rdmacm = True
# enable_rdmacm = True # RDMACM not working right now
ucx_net_devices = "mlx5_0:1"

# set up environment
Expand All @@ -40,7 +42,15 @@ def main(
)

# initialize client
client = Client(address) # noqa F841
client = Client(address)

# client code here
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is a bit misleading; I would rename it to "user code". Saying "client code" may suggest that it is executed on the client side only.

rs = da.random.RandomState(RandomState=cupy.random.RandomState)
x = rs.random((10000, 10000), chunks=1000)
x.sum().compute()

# shutdown client
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
# shutdown client
# shutdown cluster

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This shuts down the entire cluster, not the client: https://distributed.dask.org/en/latest/api.html#distributed.Client.shutdown

client.shutdown()


if __name__ == "__main__":
Expand Down
13 changes: 6 additions & 7 deletions examples/ucx/dask_cuda_worker.sh
Original file line number Diff line number Diff line change
Expand Up @@ -6,12 +6,10 @@ usage() {
}

# parse arguments
address=localhost
rmm_pool_size=1GB

while getopts ":a:i:r:t:" flag; do
case "${flag}" in
a) address=${OPTARG};;
i) interface=${OPTARG};;
r) rmm_pool_size=${OPTARG};;
t) transport=${OPTARG};;
Expand All @@ -29,8 +27,8 @@ DASK_UCX__CUDA_COPY=True
DASK_UCX__TCP=True
DASK_RMM__POOL_SIZE=$rmm_pool_size

scheduler_flags="--protocol ucx"
worker_flags="--enable-tcp-over-ucx --rmm-pool-size ${rmm_pool_size}"
scheduler_flags="--scheduler-file scheduler.json --protocol ucx"
worker_flags="--scheduler-file scheduler.json --enable-tcp-over-ucx --rmm-pool-size ${rmm_pool_size}"

if ! [ -z ${interface+x} ]; then
scheduler_flags+=" --interface ${interface}"
Expand All @@ -42,14 +40,15 @@ if [[ $transport == *"nvlink"* ]]; then
fi
if [[ $transport == *"ib"* ]]; then
DASK_UCX__INFINIBAND=True
DASK_UCX__RDMACM=True
# DASK_UCX__RDMACM=True # RDMACM not working right now
DASK_UCX__NET_DEVICES=mlx5_0:1

worker_flags+=" --enable-infiniband --enable-rdmacm --net-devices=auto"
# worker_flags+=" --enable-infiniband --enable-rdmacm --net-devices=auto"
worker_flags+=" --enable-infiniband --net-devices=auto"
fi

# initialize scheduler
dask-scheduler $scheduler_flags &

# initialize workers
dask-cuda-worker ucx://${address}:8786 $worker_flags
dask-cuda-worker $worker_flags
14 changes: 12 additions & 2 deletions examples/ucx/local_cuda_cluster.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
import click
import cupy

from dask import array as da
from dask.distributed import Client
from dask.utils import parse_bytes

Expand Down Expand Up @@ -40,7 +42,7 @@ def main(
ucx_net_devices = None

if enable_infiniband:
enable_rdmacm = True
# enable_rdmacm = True # RDMACM not working right now
ucx_net_devices = "auto"

if (enable_infiniband or enable_nvlink) and not interface:
Expand All @@ -60,7 +62,15 @@ def main(
)

# initialize client
client = Client(cluster) # noqa F841
client = Client(cluster)

# client code here
rs = da.random.RandomState(RandomState=cupy.random.RandomState)
x = rs.random((10000, 10000), chunks=1000)
x.sum().compute()

# shutdown client
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
# shutdown client
# shutdown cluster

client.shutdown()


if __name__ == "__main__":
Expand Down