[dask] Update dask demo for using the new dask backend. (#10347)
trivialfis authored May 31, 2024
1 parent e6eefea commit c2e3d4f
Showing 2 changed files with 29 additions and 23 deletions.
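Editor's note on the change, with a minimal sketch (assuming a recent dask with backend dispatching, plus cupy and dask_cudf installed; this block is not part of the diff): setting the `array.backend` option makes dask's array-creation routines produce cupy-backed chunks directly, which is why the demos below no longer convert collections to GPU types by hand.

    import dask
    from dask import array as da

    with dask.config.set({"array.backend": "cupy"}):
        X = da.random.default_rng(0).uniform(size=(1024, 8))
        # Chunks are now cupy.ndarray rather than numpy.ndarray.
        print(type(X._meta))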
demo/dask/gpu_training.py: 33 changes (18 additions & 15 deletions)
@@ -3,7 +3,7 @@
 ====================================
 """
 
-import cupy as cp
+import dask
 import dask_cudf
 from dask import array as da
 from dask import dataframe as dd
@@ -24,12 +24,8 @@ def using_dask_matrix(client: Client, X: da.Array, y: da.Array) -> da.Array:
     # history obtained from evaluation metrics.
     output = dxgb.train(
         client,
-        {
-            "verbosity": 2,
-            "tree_method": "hist",
-            # Golden line for GPU training
-            "device": "cuda",
-        },
+        # Make sure the device is set to CUDA.
+        {"tree_method": "hist", "device": "cuda"},
         dtrain,
         num_boost_round=4,
         evals=[(dtrain, "train")],
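A hedged note for readers following along (not part of the diff): `dxgb.train` returns a dictionary rather than a bare booster, so the result is unpacked as sketched here, assuming the default regression metric:

    booster = output["booster"]  # the trained xgboost.Booster
    history = output["history"]  # e.g. {"train": {"rmse": [...]}} with the default metric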
@@ -50,18 +46,17 @@ def using_quantile_device_dmatrix(client: Client, X: da.Array, y: da.Array) -> da.Array:
     .. versionadded:: 1.2.0
     """
-    X = dask_cudf.from_dask_dataframe(dd.from_dask_array(X))
-    y = dask_cudf.from_dask_dataframe(dd.from_dask_array(y))
-
     # `DaskQuantileDMatrix` is used instead of `DaskDMatrix`; be careful that it
     # cannot be used for anything other than training unless a reference is
     # specified. See the `ref` argument of `DaskQuantileDMatrix`.
     dtrain = dxgb.DaskQuantileDMatrix(client, X, y)
     output = dxgb.train(
         client,
-        {"verbosity": 2, "tree_method": "hist", "device": "cuda"},
+        # Make sure the device is set to CUDA.
+        {"tree_method": "hist", "device": "cuda"},
         dtrain,
         num_boost_round=4,
         evals=[(dtrain, "train")],
     )
 
     prediction = dxgb.predict(client, output, X)
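A sketch of the `ref` mechanism mentioned in the comment above (not part of the diff; `X_valid` and `y_valid` are hypothetical validation collections): the validation matrix reuses the quantile cuts computed for the training matrix instead of recomputing them.

    dtrain = dxgb.DaskQuantileDMatrix(client, X, y)
    # Reuse dtrain's quantile cuts for the validation data.
    dvalid = dxgb.DaskQuantileDMatrix(client, X_valid, y_valid, ref=dtrain)
    output = dxgb.train(
        client,
        {"tree_method": "hist", "device": "cuda"},
        dtrain,
        num_boost_round=4,
        evals=[(dtrain, "train"), (dvalid, "valid")],
    )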
@@ -72,15 +67,23 @@ def using_quantile_device_dmatrix(client: Client, X: da.Array, y: da.Array) -> da.Array:
     # `LocalCUDACluster` is used for assigning GPUs to XGBoost processes. Here
     # `n_workers` represents the number of GPUs since we use one GPU per worker process.
     with LocalCUDACluster(n_workers=2, threads_per_worker=4) as cluster:
-        with Client(cluster) as client:
-            # generate some random data for demonstration
+        # Create the client from the cluster and set the array backend to GPU (cupy).
+        with Client(cluster) as client, dask.config.set({"array.backend": "cupy"}):
+            # Generate some random data for demonstration.
             rng = da.random.default_rng(1)
 
-            m = 100000
+            m = 2**18
             n = 100
-            X = rng.normal(size=(m, n))
+            X = rng.uniform(size=(m, n), chunks=(128**2, -1))
             y = X.sum(axis=1)
 
             X = dd.from_dask_array(X)
             y = dd.from_dask_array(y)
+            # XGBoost can take arrays directly. This is to show that the DataFrame
+            # uses the GPU backend as well.
+            assert isinstance(X, dask_cudf.DataFrame)
+            assert isinstance(y, dask_cudf.Series)
 
             print("Using DaskQuantileDMatrix")
             from_ddqdm = using_quantile_device_dmatrix(client, X, y)
             print("Using DMatrix")
demo/dask/sklearn_gpu_training.py: 19 changes (11 additions & 8 deletions)
@@ -3,6 +3,7 @@
 ===================================================================
 """
 
+import dask
 from dask import array as da
 from dask.distributed import Client
@@ -13,17 +14,18 @@
 
 
 def main(client: Client) -> dxgb.Booster:
-    # generate some random data for demonstration
+    # Generate some random data for demonstration
+    rng = da.random.default_rng(1)
+
+    m = 2**18
     n = 100
-    m = 1000000
-    partition_size = 10000
-    X = da.random.random((m, n), partition_size)
-    y = da.random.random(m, partition_size)
+    X = rng.uniform(size=(m, n), chunks=(128**2, -1))
+    y = X.sum(axis=1)
 
     regressor = dxgb.DaskXGBRegressor(verbosity=1)
-    # set the device to CUDA
+    # Set the device to CUDA
     regressor.set_params(tree_method="hist", device="cuda")
-    # assigning client here is optional
+    # Assigning client here is optional
     regressor.client = client
 
     regressor.fit(X, y, eval_set=[(X, y)])
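A hedged sketch of working with the fitted estimator through the standard scikit-learn-style interface (not part of the diff; the output path is hypothetical):

    history = regressor.evals_result()  # e.g. {"validation_0": {"rmse": [...]}}
    booster = regressor.get_booster()   # the underlying xgboost.Booster
    booster.save_model("model.json")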
@@ -42,5 +44,6 @@ def main(client: Client) -> dxgb.Booster:
     # With dask-cuda, one can scale up XGBoost to arbitrary GPU clusters.
     # `LocalCUDACluster` used here is only for demonstration purposes.
     with LocalCUDACluster() as cluster:
-        with Client(cluster) as client:
+        # Create the client from the cluster and set the array backend to GPU (cupy).
+        with Client(cluster) as client, dask.config.set({"array.backend": "cupy"}):
             main(client)
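A hedged variant (not part of the diff), assuming dask_cuda's documented device-selection keyword: `LocalCUDACluster()` with no arguments typically starts one worker per visible GPU, and specific devices can be pinned instead.

    # Restrict the cluster to two specific GPUs (sketch; adjust device ids).
    with LocalCUDACluster(CUDA_VISIBLE_DEVICES="0,1") as cluster:
        with Client(cluster) as client, dask.config.set({"array.backend": "cupy"}):
            main(client)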
