Save Databricks init scripts in the workspace [databricks] (#8961)
* Save Databricks init scripts in the workspace [databricks]

To close: #8916

Init scripts in DBFS will be deprecated on Sept 1, 2023.

Move Databricks init scripts to the workspace

Signed-off-by: Tim Liu <timl@nvidia.com>

* Change Databricks init scripts from DBFS to the workspace in the docs

Signed-off-by: Tim Liu <timl@nvidia.com>

* Reuse init scripts path

Signed-off-by: Tim Liu <timl@nvidia.com>

* Update copyright

Signed-off-by: Tim Liu <timl@nvidia.com>

* Move uploading and cleanup of Databricks workspace init scripts into the common functions

Signed-off-by: Tim Liu <timl@nvidia.com>

---------

Signed-off-by: Tim Liu <timl@nvidia.com>
NvTimLiu authored Aug 25, 2023
1 parent daedfe5 commit 69ac83c
Showing 7 changed files with 12 additions and 16 deletions.
2 changes: 1 addition & 1 deletion docs/additional-functionality/rapids-shuffle.md
@@ -394,7 +394,7 @@ sudo dpkg -i ucx-1.14.0.deb ucx-cuda-1.14.0.deb &&
 rm ucx-1.14.0-ubuntu20.04-mofed5-cuda11.tar.bz2 ucx-1.14.0.deb ucx-cuda-1.14.0.deb
 ```
-Save the script in DBFS and add it to the "Init Scripts" list:
+Save the script in Databricks workspace and add it to the "Init Scripts" list:
 ![Init scripts panel showing UCX init script](../img/Databricks/initscript_ucx.png)
Binary file modified docs/img/Databricks/initscript_ucx.png
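
The updated documentation now tells readers to keep the UCX init script in the Databricks workspace rather than on DBFS. As a rough, non-authoritative sketch of what "save the script in the Databricks workspace" can look like when scripted rather than done through the UI, the Python below pushes a local script through the Workspace API 2.0; the host, token, local file name, and destination path are placeholders, and the AUTO import format is assumed to be accepted for plain workspace files.

```python
# Minimal sketch (placeholders throughout): upload a local init script as a
# workspace file via the Databricks Workspace API 2.0.
import base64
import requests

HOST = "https://<your-workspace>.cloud.databricks.com"  # placeholder
TOKEN = "<personal-access-token>"                        # placeholder
LOCAL = "init-ucx.sh"                                    # hypothetical local script
DEST = "/databricks/init_scripts/init-ucx.sh"            # hypothetical workspace path
HEADERS = {"Authorization": f"Bearer {TOKEN}"}

# Ensure the target directory exists in the workspace.
requests.post(f"{HOST}/api/2.0/workspace/mkdirs",
              headers=HEADERS,
              json={"path": "/databricks/init_scripts"}).raise_for_status()

# Import the script content (base64-encoded) as a workspace file.
with open(LOCAL, "rb") as f:
    encoded = base64.b64encode(f.read()).decode("utf-8")
requests.post(f"{HOST}/api/2.0/workspace/import",
              headers=HEADERS,
              json={"path": DEST, "format": "AUTO",
                    "overwrite": True, "content": encoded}).raise_for_status()
```

The uploaded workspace path is then what goes into the cluster's "Init Scripts" list shown in the screenshot above.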
14 changes: 5 additions & 9 deletions jenkins/Jenkinsfile-blossom.premerge-databricks
@@ -65,7 +65,7 @@ pipeline {
         DATABRICKS_PUBKEY = credentials("SPARK_DATABRICKS_PUBKEY")
         DATABRICKS_DRIVER = DbUtils.getDriver("$DB_TYPE")
         DATABRICKS_WORKER = DbUtils.getWorker("$DB_TYPE")
-        INIT_SCRIPTS_DIR = "dbfs:/databricks/init_scripts/${BUILD_TAG}"
+        INIT_SCRIPTS_DIR = "/databricks/init_scripts/${BUILD_TAG}"
     }

     stages {
@@ -129,7 +129,7 @@ String getDbType() {
 void databricksBuild() {
     def CLUSTER_ID = ''
     def SPARK_MAJOR = BASE_SPARK_VERSION_TO_INSTALL_DATABRICKS_JARS.replace('.', '')
-    def dbfs_path = "$INIT_SCRIPTS_DIR-$DB_TYPE"
+    def ws_path = "$INIT_SCRIPTS_DIR-$DB_TYPE"
     try {
         stage("Create $SPARK_MAJOR DB") {
             script {
@@ -142,12 +142,8 @@ void databricksBuild() {

                 // handle init scripts if exist
                 if (env.INIT_SCRIPTS) {
-                    sh "bash -c 'dbfs mkdirs $dbfs_path'"
-                    env.INIT_SCRIPTS.split(',').each {
-                        sh "bash -c 'dbfs cp --overwrite jenkins/databricks/${it} $dbfs_path'"
-                    }
-                    // foo.sh,bar.sh --> dbfs:/path/foo.sh,dbfs:/path/bar.sh
-                    CREATE_PARAMS += " -f $dbfs_path/" + env.INIT_SCRIPTS.replace(',', ",$dbfs_path/")
+                    // foo.sh,bar.sh --> /path/foo.sh,/path/bar.sh
+                    CREATE_PARAMS += " -f " + DbUtils.uploadFiles(this, env.INIT_SCRIPTS, ws_path)
                 }

                 CLUSTER_ID = sh(script: "python3 ./jenkins/databricks/create.py $CREATE_PARAMS",
@@ -194,7 +190,7 @@ void databricksBuild() {
             container('cpu') {
                 retry(3) {
                     if (env.INIT_SCRIPTS) {
-                        sh "bash -c 'dbfs rm -r $dbfs_path'"
+                        DbUtils.cleanUp(this, ws_path)
                     }
                     sh "python3 ./jenkins/databricks/shutdown.py -s $DATABRICKS_HOST -t $DATABRICKS_TOKEN -c $CLUSTER_ID -d"
                 }
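
The pipeline previously shelled out to the dbfs CLI to create the staging directory, copy each script, and remove it afterwards; it now delegates that work to DbUtils.uploadFiles and DbUtils.cleanUp from the shared Jenkins library, whose Groovy source is not part of this diff. Purely as an illustration of the contract the diff implies (upload the comma-separated scripts under the per-build workspace path, hand back the remote paths for create.py's -f option, then delete the directory recursively), here is a hedged Python sketch; to_remote_paths and clean_up are hypothetical names, not the library's API, and each script would still need to be imported into the workspace first, e.g. as in the sketch above.

```python
# Illustration only -- not the Groovy DbUtils helpers used by the Jenkinsfile.
import requests

def to_remote_paths(init_scripts: str, ws_path: str) -> str:
    """'foo.sh,bar.sh' -> '/path/foo.sh,/path/bar.sh' (the value passed to -f)."""
    return ",".join(f"{ws_path}/{name}" for name in init_scripts.split(","))

def clean_up(host: str, token: str, ws_path: str) -> None:
    """Recursively delete the per-build init-script directory from the workspace."""
    resp = requests.post(
        f"{host}/api/2.0/workspace/delete",
        headers={"Authorization": f"Bearer {token}"},
        json={"path": ws_path, "recursive": True},
    )
    resp.raise_for_status()

# Example of the path mapping the Jenkinsfile comment describes:
# to_remote_paths("foo.sh,bar.sh", "/databricks/init_scripts/build-123-AWS")
# -> "/databricks/init_scripts/build-123-AWS/foo.sh,/databricks/init_scripts/build-123-AWS/bar.sh"
```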
4 changes: 2 additions & 2 deletions jenkins/databricks/clusterutils.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2020-2021, NVIDIA CORPORATION.
+# Copyright (c) 2020-2023, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -52,7 +52,7 @@ def generate_create_templ(sshKey, cluster_name, runtime, idle_timeout,
     for path in path_list:
         templ['init_scripts'].append(
             {
-                'dbfs' : {
+                'workspace' : {
                     'destination' : path
                 }
             }
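
For reference, a minimal sketch of the init_scripts block that generate_create_templ now produces: each entry carries a 'workspace' destination where it previously used 'dbfs'. The example paths are hypothetical; the real path_list comes from the comma-separated -f argument handled in create.py.

```python
# Sketch of the cluster-create template fragment after this change.
# Example paths are hypothetical.
path_list = [
    "/databricks/init_scripts/build-123-AWS/foo.sh",
    "/databricks/init_scripts/build-123-AWS/bar.sh",
]

templ = {'init_scripts': []}
for path in path_list:
    templ['init_scripts'].append({
        'workspace': {              # previously: 'dbfs': {'destination': ...}
            'destination': path
        }
    })

# templ['init_scripts'] now holds:
# [{'workspace': {'destination': '/databricks/init_scripts/build-123-AWS/foo.sh'}},
#  {'workspace': {'destination': '/databricks/init_scripts/build-123-AWS/bar.sh'}}]
```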
2 changes: 1 addition & 1 deletion jenkins/databricks/create.py
@@ -34,7 +34,7 @@ def main():
     worker_type = 'g4dn.xlarge'
     driver_type = 'g4dn.xlarge'
     cloud_provider = 'aws'
-    # comma separated init scripts, e.g. dbfs:/foo,dbfs:/bar,...
+    # comma separated init scripts in Databricks workspace, e.g. /foo,/bar,...
    init_scripts = ''
    aws_zone='us-west-2c'

4 changes: 2 additions & 2 deletions jenkins/databricks/init_cuda11_runtime.sh
@@ -1,6 +1,6 @@
 #!/bin/bash
 #
-# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2021-2023, NVIDIA CORPORATION. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -16,7 +16,7 @@
 #

 # The init script to install cuda11.0 toolkit
-# Will be automatically pushed into the dbfs:/databricks/init_scripts once it is updated.
+# Will be automatically pushed into the Databricks workspace: /databricks/init_scripts once it is updated.

 wget http://developer.download.nvidia.com/compute/cuda/11.0.2/local_installers/cuda_11.0.2_450.51.05_linux.run

2 changes: 1 addition & 1 deletion jenkins/databricks/init_cudf_udf.sh
@@ -16,7 +16,7 @@
 #

 # The initscript to set up environment for the cudf_udf tests on Databricks
-# Will be automatically pushed into the dbfs:/databricks/init_scripts once it is updated.
+# Will be automatically pushed into the Databricks Workspace: /databricks/init_scripts/ once it is updated.

 set -ex

