Skip to content

Commit

Permalink
[Data] Update batch inference release tests (ray-project#49012)
Browse files Browse the repository at this point in the history
Other than minor cleanups, this PR:
* Makes batch inference release tests use autoscaling, to ensure Ray Data works well with autoscaling
* Removes the "raw images" variants, because they add little value beyond the parquet tests and people use parquet more often
* Updates the node types, for consistency with the other release test compute

---------

Signed-off-by: Balaji Veeramani <bveeramani@berkeley.edu>
Signed-off-by: Connor Sanders <connor@elastiflow.com>
  • Loading branch information
bveeramani authored and jecsand838 committed Dec 4, 2024
1 parent d41c843 commit f4f2785
Show file tree
Hide file tree
Showing 6 changed files with 76 additions and 206 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
cloud_id: {{env["ANYSCALE_CLOUD_ID"]}}
region: us-west-2

max_workers: 0
max_workers: 10

head_node_type:
name: head_node
Expand Down
19 changes: 19 additions & 0 deletions release/nightly_tests/dataset/autoscaling_gpu_compute.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
# This config matches the default config for Anyscale workspaces with autoscaling,
# except instead of using CPU instances, it uses GPU instances.
cloud_id: {{env["ANYSCALE_CLOUD_ID"]}}
region: us-west-2

max_workers: 10

head_node_type:
name: head_node
instance_type: m5.2xlarge
resources:
cpu: 0

worker_node_types:
- name: worker_node
instance_type: g4dn.2xlarge
min_workers: 0
max_workers: 10
use_spot: false
23 changes: 23 additions & 0 deletions release/nightly_tests/dataset/autoscaling_hetero_compute.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
cloud_id: {{env["ANYSCALE_CLOUD_ID"]}}
region: us-west-2

max_workers: 20

head_node_type:
name: head_node
instance_type: m5.2xlarge
resources:
cpu: 0

worker_node_types:
- name: worker_node_gpu
instance_type: g4dn.2xlarge
min_workers: 0
max_workers: 10
use_spot: false

- name: worker_node_cpu
instance_type: m5.2xlarge
min_workers: 0
max_workers: 10
use_spot: false
21 changes: 0 additions & 21 deletions release/nightly_tests/dataset/compute_hetero_10x10_aws.yaml

This file was deleted.

7 changes: 4 additions & 3 deletions release/nightly_tests/dataset/gpu_batch_inference.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,16 +88,17 @@ def __call__(self, batch: Dict[str, np.ndarray]) -> Dict[str, np.ndarray]:
start_time_without_metadata_fetching = time.time()

if smoke_test:
actor_pool_size = 4
compute = ActorPoolStrategy(size=4)
num_gpus = 0
else:
actor_pool_size = int(ray.cluster_resources().get("GPU"))
# Autoscale to use as many GPUs as possible.
compute = ActorPoolStrategy(min_size=1, max_size=None)
num_gpus = 1
ds = ds.map_batches(preprocess)
ds = ds.map_batches(
Predictor,
batch_size=BATCH_SIZE,
compute=ActorPoolStrategy(size=actor_pool_size),
compute=compute,
num_gpus=num_gpus,
fn_constructor_kwargs={"model": model_ref},
max_concurrency=2,
Expand Down
210 changes: 29 additions & 181 deletions release/release_data_tests.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,9 @@

cluster:
byod:
# 'type: gpu' means: use the 'ray-ml' image.
type: gpu
cluster_compute: multi_node_autoscaling_compute.yaml
cluster_compute: autoscaling_cpu_compute.yaml

###############
# Reading tests
Expand All @@ -33,31 +34,6 @@
timeout: 600
script: python read_and_consume_benchmark.py s3://ray-benchmark-data/parquet/10TiB --format parquet --count

- name: stable_diffusion_benchmark
group: data-tests
working_dir: nightly_tests/dataset

frequency: nightly
team: data

cluster:
byod:
type: gpu
post_build_script: byod_stable_diffusion.sh
cluster_compute: stable_diffusion_benchmark_compute.yaml

run:
timeout: 1800
script: python stable_diffusion_benchmark.py

variations:
- __suffix__: aws
- __suffix__: gce
env: gce
frequency: manual
cluster:
cluster_compute: stable_diffusion_benchmark_compute_gce.yaml

- name: streaming_data_ingest_benchmark_1tb
group: data-tests
working_dir: nightly_tests/dataset
Expand Down Expand Up @@ -545,177 +521,49 @@
cluster_compute: shuffle/datasets_large_scale_compute_small_instances_gce.yaml


############################
# Batch Inference Benchmarks
############################

# 10 GB image classification raw images with 1 GPU.
# 1 g4dn.4xlarge
- name: torch_batch_inference_1_gpu_10gb_raw
group: data-tests
working_dir: nightly_tests/dataset

frequency: nightly
team: data
cluster:
byod:
type: gpu
cluster_compute: compute_gpu_1_cpu_16_aws.yaml

run:
timeout: 500
script: python gpu_batch_inference.py --data-directory=10G-image-data-synthetic-raw --data-format raw

alert: default

variations:
- __suffix__: aws
- __suffix__: gce
env: gce
frequency: manual
cluster:
cluster_compute: compute_gpu_1_cpu_16_gce.yaml

# 10 GB image classification parquet with 1 GPU.
# 1 g4dn.4xlarge
- name: torch_batch_inference_1_gpu_10gb_parquet
group: data-tests
working_dir: nightly_tests/dataset

frequency: nightly
team: data
cluster:
byod:
type: gpu
cluster_compute: compute_gpu_1_cpu_16_aws.yaml

run:
timeout: 500
script: python gpu_batch_inference.py --data-directory=10G-image-data-synthetic-raw-parquet --data-format parquet

alert: default

variations:
- __suffix__: aws
- __suffix__: gce
env: gce
frequency: manual
cluster:
cluster_compute: compute_gpu_1_cpu_16_gce.yaml

#######################
# Batch inference tests
#######################

# 300 GB image classification raw images with 16 GPUs
# 4 g4dn.12xlarge
- name: torch_batch_inference_16_gpu_300gb_raw
group: data-tests
working_dir: nightly_tests/dataset
# 300 GB image classification parquet data with up to 10 GPUs
# Autoscales from 0 to 10 g4dn.2xlarge workers.
- name: batch_inference

frequency: nightly
team: data
cluster:
byod:
type: gpu
cluster_compute: compute_gpu_4x4_aws.yaml
cluster_compute: autoscaling_gpu_compute.yaml

run:
timeout: 1000
script: python gpu_batch_inference.py --data-directory 300G-image-data-synthetic-raw --data-format raw

wait_for_nodes:
num_nodes: 4

alert: default

variations:
- __suffix__: aws
- __suffix__: gce
env: gce
frequency: manual
cluster:
cluster_compute: compute_gpu_4x4_gce.yaml

timeout: 1800
script: >
python gpu_batch_inference.py
--data-directory 300G-image-data-synthetic-raw-parquet --data-format parquet
- name: chaos_torch_batch_inference_16_gpu_300gb_raw
group: data-tests
- name: batch_inference_chaos
stable: False
# Don't use 'nightly_tests/dataset' as the working directory because we need to run
# the 'setup_chaos.py' script.
working_dir: nightly_tests
stable: false

frequency: nightly
team: data
cluster:
byod:
type: gpu
cluster_compute: dataset/compute_gpu_4x4_aws.yaml
cluster_compute: dataset/autoscaling_gpu_compute.yaml

run:
timeout: 1000
timeout: 1800
prepare: python setup_chaos.py --max-to-kill 2 --kill-delay 30
script: python dataset/gpu_batch_inference.py --data-directory 300G-image-data-synthetic-raw --data-format raw

wait_for_nodes:
num_nodes: 4

alert: default

variations:
- __suffix__: aws
- __suffix__: gce
env: gce
frequency: manual
cluster:
cluster_compute: dataset/compute_gpu_4x4_gce.yaml


# 300 GB image classification parquet data with 16 GPUs
# 4 g4dn.12xlarge
- name: torch_batch_inference_16_gpu_300gb_parquet
group: data-tests
working_dir: nightly_tests/dataset

frequency: nightly
team: data

cluster:
byod:
type: gpu
cluster_compute: compute_gpu_4x4_aws.yaml

run:
timeout: 1000
script: python gpu_batch_inference.py --data-directory 300G-image-data-synthetic-raw-parquet --data-format parquet

wait_for_nodes:
num_nodes: 4

alert: default

variations:
- __suffix__: aws
- __suffix__: gce
env: gce
frequency: manual
cluster:
cluster_compute: compute_gpu_4x4_gce.yaml
script: >
python dataset/gpu_batch_inference.py
--data-directory 300G-image-data-synthetic-raw-parquet --data-format parquet
# 10 TB image classification parquet data with heterogenous cluster
# 10 TB image classification parquet data with autoscaling heterogeneous cluster
# Autoscales up to 10 g4dn.2xlarge and up to 10 m5.2xlarge workers.
- name: torch_batch_inference_hetero_10tb_parquet
group: data-tests
working_dir: nightly_tests/dataset

- name: batch_inference_hetero
frequency: weekly
team: data

cluster:
byod:
type: gpu
cluster_compute: compute_hetero_10x10_aws.yaml
cluster_compute: autoscaling_hetero_compute.yaml

run:
timeout: 2000
script: python gpu_batch_inference.py --data-directory 10T-image-data-synthetic-raw-parquet --data-format parquet

wait_for_nodes:
num_nodes: 20

alert: default
timeout: 3600
script: >
python gpu_batch_inference.py
--data-directory 10T-image-data-synthetic-raw-parquet --data-format parquet

0 comments on commit f4f2785

Please sign in to comment.