
Commit 40b9e84

cleanup and add 15.4 as db version option

Signed-off-by: Erik Ordentlich <eordentlich@gmail.com>

eordentlich committed Aug 23, 2024
1 parent ba06701 commit 40b9e84
Showing 3 changed files with 13 additions and 12 deletions.
12 changes: 7 additions & 5 deletions python/benchmark/benchmark/bench_kmeans.py
```diff
@@ -13,16 +13,16 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
+import os
 import time
 from typing import Any, Dict, Iterator, List, Optional, Union
 
 import numpy as np
-import os
 import pandas as pd
+import pyspark.sql.functions as F
 from pyspark.ml.feature import VectorAssembler
 from pyspark.ml.functions import array_to_vector, vector_to_array
 from pyspark.sql import DataFrame, SparkSession
-import pyspark.sql.functions as F
 from pyspark.sql.functions import array, col, sum
 from pyspark.sql.types import DoubleType, StructField, StructType
 
@@ -195,11 +195,13 @@ def gpu_cache_df(df: DataFrame) -> DataFrame:
         # temporary patch for DB with spark-rapids plugin
         # this part is not timed so overhead is not critical, but should be reverted
         # once https://github.com/NVIDIA/spark-rapids/issues/10770 is fixed
-        db_version = os.environ.get('DATABRICKS_RUNTIME_VERSION')
+        db_version = os.environ.get("DATABRICKS_RUNTIME_VERSION")
         if db_version:
-            dim=len(cluster_centers[0])
+            dim = len(cluster_centers[0])
             # inject unsupported expr (slice) that is essentially a noop
-            df_for_scoring = df_for_scoring.select(F.slice(feature_col,1,dim).alias(feature_col), output_col)
+            df_for_scoring = df_for_scoring.select(
+                F.slice(feature_col, 1, dim).alias(feature_col), output_col
+            )
 
         if num_cpus > 0:
             from pyspark.ml.clustering import KMeans as SparkKMeans
```
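The patched Python above only activates when `DATABRICKS_RUNTIME_VERSION` is set, an environment variable that Databricks runtimes export automatically; everywhere else `os.environ.get` returns `None` and the workaround is skipped. A minimal shell sketch (not part of this commit) mirrors that same detection:

```shell
# Check the same env var the Python patch keys off: set on Databricks
# driver/executor nodes, unset on ordinary machines.
if [ -n "${DATABRICKS_RUNTIME_VERSION:-}" ]; then
    echo "Databricks runtime ${DATABRICKS_RUNTIME_VERSION} detected; slice workaround applies"
else
    echo "not on Databricks; no workaround needed"
fi
```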
6 changes: 3 additions & 3 deletions python/benchmark/databricks/README.md
````diff
@@ -46,7 +46,7 @@ This directory contains shell scripts for running larger scale benchmarks on Dat
 
 The script creates a cpu or gpu cluster, respectively using the cluster specs in [cpu_cluster_spec](./cpu_cluster_spec.sh), [gpu_cluster_spec](./gpu_cluster_spec.sh), [gpu_etl_cluster_spec](./gpu_etl_cluster_spec.sh), depending on the supplied argument. In gpu and gpu_etl mode each algorithm benchmark is run 3 times, and similarly in cpu mode, except for kmeans and random forest classifier and regressor which are each run 1 time due to their long running times. gpu_etl mode also uses the [spark-rapids](https://github.com/NVIDIA/spark-rapids) gpu accelerated plugin.
 
-An optional databricks runtime version can be supplied as second argument. The default is 13.3 if not specified. Runtime 14.3 can not be specified in gpu_etl mode as it is not yet compatible with the spark-rapids plugin.
+An optional databricks runtime version can be supplied as a second argument, with 13.3 being the default if not specified. Runtimes higher than 13.3 are only compatible with cpu and gpu modes (i.e. not gpu_etl) as they are not yet supported by the spark-rapids plugin.
 
 3. The file `benchmark_log` will have the fit/train/transform running times and accuracy scores. A simple convenience script has been provided to extract timing information for each run:
 ```bash
@@ -58,6 +58,6 @@ This directory contains shell scripts for running larger scale benchmarks on Dat
 databricks jobs cancel-run <runid> --profile $DB_PROFILE
 ```
 
-5. The created clusters are configured to terminate after 30 min, but can be manually terminated or deleted via the Databricks UI.
+1. The created clusters are configured to terminate after 30 min, but can be manually terminated or deleted via the Databricks UI.
 
-6. Monitor progress periodically in case of a possible hang, to avoid incurring cloud costs in such cases.
+2. Monitor progress periodically in case of a possible hang, to avoid incurring cloud costs in such cases.
````
7 changes: 3 additions & 4 deletions python/benchmark/databricks/run_benchmark.sh
```diff
@@ -1,7 +1,7 @@
 #!/bin/bash
 
-cluster_type=$1
-db_version=${$2:-13.3}
+cluster_type=${1:-gpu_etl}
+db_version=${2:-13.3}
 
 if [[ $cluster_type == "gpu" || $cluster_type == "gpu_etl" ]]; then
     num_cpus=0
@@ -11,7 +11,7 @@ elif [[ $cluster_type == "cpu" ]]; then
     num_gpus=0
 else
     echo "unknown cluster type $cluster_type"
-    echo "usage: $0 cpu|gpu|gpu_etl"
+    echo "usage: $0 cpu|gpu|gpu_etl [12.2|13.3|14.3|15.4]"
     exit 1
 fi
 
@@ -23,7 +23,6 @@ fi
 
 source benchmark_utils.sh
 
-#BENCHMARK_DATA_HOME=/spark-rapids-ml/benchmarking/datasets
 BENCHMARK_DATA_HOME=s3a://spark-rapids-ml-bm-datasets-public
 
 # creates cluster and sets CLUSTER_ID equal to created cluster's id
```
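The `db_version` fix in this file is worth spelling out: `${$2:-13.3}` is not valid bash (the parameter name inside `${...}` cannot itself contain an expansion, so bash typically aborts with a "bad substitution" error), whereas `${2:-13.3}` expands to `$2` when it is set and falls back to the literal `13.3` otherwise. A standalone sketch of the corrected expansion:

```shell
# Simulate the script's positional arguments with `set --`.
set -- gpu            # $1=gpu, $2 unset
db_version=${2:-13.3} # falls back to the default
echo "$db_version"    # prints 13.3

set -- gpu 15.4       # $2 now supplied
db_version=${2:-13.3} # takes the caller's value
echo "$db_version"    # prints 15.4
```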
