Remove arguments related to cost-savings #1230

Merged (5 commits) on Jul 26, 2024
256 changes: 29 additions & 227 deletions user_tools/src/spark_rapids_pytools/rapids/qualification.py

Large diffs are not rendered by default.

@@ -42,6 +42,7 @@ toolOutput:
- 'Strongly Recommended'
- 'Recommended'
groupColumns:
enabled: false
keys:
- Vendor
- Driver Host
6 changes: 1 addition & 5 deletions user_tools/src/spark_rapids_pytools/wrapper.py
@@ -1,4 +1,4 @@
# Copyright (c) 2023, NVIDIA CORPORATION.
# Copyright (c) 2023-2024, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -19,19 +19,15 @@
from spark_rapids_pytools.wrappers.databricks_aws_wrapper import DBAWSWrapper
from spark_rapids_pytools.wrappers.databricks_azure_wrapper import DBAzureWrapper
from spark_rapids_pytools.wrappers.dataproc_wrapper import DataprocWrapper
from spark_rapids_pytools.wrappers.dataproc_gke_wrapper import DataprocGKEWrapper
from spark_rapids_pytools.wrappers.emr_wrapper import EMRWrapper
from spark_rapids_pytools.wrappers.onprem_wrapper import OnPremWrapper


def main():
fire.Fire({
'emr': EMRWrapper,
'dataproc': DataprocWrapper,
'dataproc-gke': DataprocGKEWrapper,
'databricks-aws': DBAWSWrapper,
'databricks-azure': DBAzureWrapper,
'onprem': OnPremWrapper
})
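
For context, `fire.Fire` turns this dictionary into the command-line dispatch table: the first CLI token selects a wrapper class and the next token selects one of its methods, so removing the 'dataproc-gke' entry drops that platform from the CLI. Below is a minimal, self-contained sketch of the same dispatch pattern; the DemoWrapper class and the 'demo' key are hypothetical and not part of this repository.

import fire


class DemoWrapper:  # hypothetical stand-in for a platform wrapper such as EMRWrapper
    def qualification(self, eventlogs: str = None) -> None:
        # A real wrapper would download dependencies and launch the tool here.
        print(f'qualification requested for eventlogs={eventlogs}')


if __name__ == '__main__':
    # `python demo_cli.py demo qualification --eventlogs=/tmp/logs` resolves to
    # DemoWrapper().qualification(eventlogs='/tmp/logs').
    fire.Fire({'demo': DemoWrapper})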


232 changes: 3 additions & 229 deletions user_tools/src/spark_rapids_pytools/wrappers/databricks_aws_wrapper.py

Large diffs are not rendered by default.

@@ -14,233 +14,17 @@


"""Wrapper class to run tools associated with RAPIDS Accelerator for Apache Spark plugin on DATABRICKS_AZURE."""
from spark_rapids_pytools.cloud_api.sp_types import DeployMode
from spark_rapids_pytools.common.utilities import Utils, ToolLogging

from spark_rapids_pytools.common.utilities import ToolLogging
from spark_rapids_pytools.rapids.diagnostic import Diagnostic
from spark_rapids_pytools.rapids.profiling import ProfilingAsLocal
from spark_rapids_pytools.rapids.qualification import QualFilterApp, QualificationAsLocal, QualGpuClusterReshapeType
from spark_rapids_tools import CspEnv
from spark_rapids_tools.utils import Utilities


class CliDBAzureLocalMode: # pylint: disable=too-few-public-methods
"""
A wrapper that runs the RAPIDS Accelerator tools locally on the dev machine for DATABRICKS_AZURE.
"""

@staticmethod
def qualification(cpu_cluster: str = None,
eventlogs: str = None,
profile: str = None,
local_folder: str = None,
remote_folder: str = None,
gpu_cluster: str = None,
tools_jar: str = None,
credentials_file: str = None,
filter_apps: str = QualFilterApp.tostring(QualFilterApp.get_default()),
gpu_cluster_recommendation: str = QualGpuClusterReshapeType.tostring(
QualGpuClusterReshapeType.get_default()),
estimation_model: str = None,
jvm_heap_size: int = None,
verbose: bool = None,
cpu_discount: int = None,
gpu_discount: int = None,
global_discount: int = None,
**rapids_options) -> None:
"""
The Qualification tool analyzes Spark events generated from CPU-based Spark applications to
help quantify the expected acceleration and cost savings of migrating a Spark application
or query to GPU. The wrapper downloads dependencies and executes the analysis on the local
dev machine.
:param cpu_cluster: The Databricks-cluster on which the Spark applications were executed. The argument
can be a Databricks-cluster ID or a valid path to the cluster's properties file (json format)
generated by the databricks-CLI.
:param eventlogs: Event log filenames or ABFS (Azure Blob File System) storage directories
containing event logs (comma separated). If missing, the wrapper reads the Spark's
property `spark.eventLog.dir` defined in `cpu_cluster`. This property should be included
in the output of `databricks clusters get CLUSTER_ID [flags]`.
Note that the wrapper will raise an exception if the property is not set.
:param profile: A named Databricks profile to get the settings/credentials of the Databricks CLI.
:param local_folder: Local work-directory path to store the output and to be used as root
directory for temporary folders/files. The final output will go into a subdirectory called
${local_folder}/qual-${EXEC_ID} where exec_id is an auto-generated unique identifier of the
execution. If the argument is NONE, the default value is the env variable
RAPIDS_USER_TOOLS_OUTPUT_DIRECTORY if any; or the current working directory.
:param remote_folder: An ABFS (Azure Blob File System) folder where the output is uploaded at the end
of execution. If no value is provided, the output will be only available on local disk.
:param gpu_cluster: The Databricks-cluster on which the Spark applications are planned to be migrated.
The argument can be a Databricks-cluster ID or a valid path to the cluster's properties file
(json format) generated by the databricks-CLI. If missing, the wrapper maps the databricks machine
instances of the original cluster into databricks instances that support GPU acceleration.
:param tools_jar: Path to a bundled jar that includes the RAPIDS tools. The path can be a local filesystem path
or a remote ABFS url. If missing, the wrapper downloads the latest rapids-4-spark-tools_*.jar
from maven repo.
:param credentials_file: The local path of JSON file that contains the application credentials.
If missing, the wrapper looks for "DATABRICKS_CONFIG_FILE" environment variable
to provide the location of a credential file. The default credentials file exists as
"~/.databrickscfg" on Unix, Linux, or macOS
:param filter_apps: Filtering criterion for the applications listed in the final STDOUT table;
one of the following (all, speedups, savings, top_candidates).
Note that this filter does not affect the CSV report.
"all" applies no filter. "speedups" lists all the apps that are either
'Recommended' or 'Strongly Recommended' based on speedups. "savings"
lists all the apps that have positive estimated GPU savings, except for the apps that
are "Not Applicable". "top_candidates" lists all apps whose unsupported-operator
stage duration is less than 25% of the app duration and whose speedup is greater than 1.3x.
:param gpu_cluster_recommendation: The type of GPU cluster recommendation to generate.
It accepts one of the following ("CLUSTER", "JOB" and the default value "MATCH").
"MATCH": keep GPU cluster same number of nodes as CPU cluster;
"CLUSTER": recommend optimal GPU cluster by cost for entire cluster;
"JOB": recommend optimal GPU cluster by cost per job.
:param estimation_model: Model used to calculate the estimated GPU duration and cost savings.
It accepts one of the following:
"xgboost": an XGBoost model for GPU duration estimation
"speedups": set by default. It uses a simple static estimated speedup per operator.
:param jvm_heap_size: The maximum heap size of the JVM in gigabytes.
:param verbose: True or False to enable verbosity to the wrapper script.
:param cpu_discount: A percent discount for the cpu cluster cost in the form of an integer value
(e.g. 30 for 30% discount).
:param gpu_discount: A percent discount for the gpu cluster cost in the form of an integer value
(e.g. 30 for 30% discount).
:param global_discount: A percent discount for both the cpu and gpu cluster costs in the form of an
integer value (e.g. 30 for 30% discount).
:param rapids_options: A list of valid Qualification tool options.
Note that the wrapper ignores ["output-directory", "platform"] flags, and it does not support
multiple "spark-property" arguments.
For more details on Qualification tool options, please visit
https://docs.nvidia.com/spark-rapids/user-guide/latest/qualification/jar-usage.html#running-the-qualification-tool-standalone-on-spark-event-logs
"""
verbose = Utils.get_value_or_pop(verbose, rapids_options, 'v', False)
profile = Utils.get_value_or_pop(profile, rapids_options, 'p')
remote_folder = Utils.get_value_or_pop(remote_folder, rapids_options, 'r')
jvm_heap_size = Utils.get_value_or_pop(jvm_heap_size, rapids_options, 'j',
Utilities.get_system_memory_in_gb())
eventlogs = Utils.get_value_or_pop(eventlogs, rapids_options, 'e')
filter_apps = Utils.get_value_or_pop(filter_apps, rapids_options, 'f')
tools_jar = Utils.get_value_or_pop(tools_jar, rapids_options, 't')
local_folder = Utils.get_value_or_pop(local_folder, rapids_options, 'l')
if verbose:
# When verbose is enabled, turn on debug logging via the environment.
ToolLogging.enable_debug_mode()
wrapper_qual_options = {
'platformOpts': {
# the databricks profile
'profile': profile,
'credentialFile': credentials_file,
'deployMode': DeployMode.LOCAL,
},
'migrationClustersProps': {
'cpuCluster': cpu_cluster,
'gpuCluster': gpu_cluster
},
'jobSubmissionProps': {
'remoteFolder': remote_folder,
'platformArgs': {
'jvmMaxHeapSize': jvm_heap_size
}
},
'eventlogs': eventlogs,
'filterApps': filter_apps,
'toolsJar': tools_jar,
'gpuClusterRecommendation': gpu_cluster_recommendation,
'cpuDiscount': cpu_discount,
'gpuDiscount': gpu_discount,
'globalDiscount': global_discount,
'estimationModel': estimation_model
}
QualificationAsLocal(platform_type=CspEnv.DATABRICKS_AZURE,
cluster=None,
output_folder=local_folder,
wrapper_options=wrapper_qual_options,
rapids_options=rapids_options).launch()
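
# Illustrative note, not part of the original file: the cpu_discount, gpu_discount and
# global_discount flags above are plain integer percentages, so a value of 30 is
# presumably folded into the estimated cluster cost roughly as
#     discounted_cost = estimated_cost * (1 - discount / 100.0)
# before the tool compares CPU and GPU costs.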

@staticmethod
def profiling(gpu_cluster: str = None,
worker_info: str = None,
eventlogs: str = None,
profile: str = None,
local_folder: str = None,
remote_folder: str = None,
tools_jar: str = None,
credentials_file: str = None,
jvm_heap_size: int = None,
verbose: bool = None,
**rapids_options) -> None:
"""
The Profiling tool analyzes both CPU- and GPU-generated event logs and generates information
which can be used for debugging and profiling Apache Spark applications.
:param gpu_cluster: The Databricks-cluster on which the Spark applications were executed. The argument
can be a Databricks-cluster ID or a valid path to the cluster's properties file (json format)
generated by the databricks-CLI. If missing, then the argument worker_info has to be provided.
:param worker_info: A path pointing to a yaml file containing the system information of a
worker node. It is assumed that all workers are homogeneous.
If missing, the wrapper pulls the worker info from the "gpu_cluster".
:param eventlogs: Event log filenames or ABFS (Azure Blob File System) storage directories
containing event logs (comma separated). If missing, the wrapper reads the Spark's
property `spark.eventLog.dir` defined in `gpu_cluster`. This property should be included
in the output of `databricks clusters get CLUSTER_ID [flags]`.
Note that the wrapper will raise an exception if the property is not set.
:param profile: A named Databricks profile to get the settings/credentials of the Databricks CLI.
:param local_folder: Local work-directory path to store the output and to be used as root
directory for temporary folders/files. The final output will go into a subdirectory called
${local_folder}/prof-${EXEC_ID} where exec_id is an auto-generated unique identifier of the
execution. If the argument is NONE, the default value is the env variable
RAPIDS_USER_TOOLS_OUTPUT_DIRECTORY if any; or the current working directory.
:param remote_folder: An ABFS (Azure Blob File System) folder where the output is uploaded at the end
of execution. If no value is provided, the output will be only available on local disk.
:param tools_jar: Path to a bundled jar that includes the RAPIDS tools. The path can be a local filesystem path
or a remote ABFS url. If missing, the wrapper downloads the latest rapids-4-spark-tools_*.jar
from maven repo.
:param credentials_file: The local path of JSON file that contains the application credentials.
If missing, the wrapper looks for "DATABRICKS_CONFIG_FILE" environment variable
to provide the location of a credential file. The default credentials file exists as
"~/.databrickscfg" on Unix, Linux, or macOS.
:param verbose: True or False to enable verbosity to the wrapper script.
:param jvm_heap_size: The maximum heap size of the JVM in gigabytes.
:param rapids_options: A list of valid Profiling tool options.
Note that the wrapper ignores ["output-directory", "worker-info"] flags, and it does not support
multiple "spark-property" arguments.
For more details on Profiling tool options, please visit
https://docs.nvidia.com/spark-rapids/user-guide/latest/profiling/jar-usage.html#prof-tool-title-options
"""
verbose = Utils.get_value_or_pop(verbose, rapids_options, 'v', False)
profile = Utils.get_value_or_pop(profile, rapids_options, 'p')
credentials_file = Utils.get_value_or_pop(credentials_file, rapids_options, 'c')
gpu_cluster = Utils.get_value_or_pop(gpu_cluster, rapids_options, 'g')
remote_folder = Utils.get_value_or_pop(remote_folder, rapids_options, 'r')
jvm_heap_size = Utils.get_value_or_pop(jvm_heap_size, rapids_options, 'j', Utilities.get_system_memory_in_gb())
eventlogs = Utils.get_value_or_pop(eventlogs, rapids_options, 'e')
tools_jar = Utils.get_value_or_pop(tools_jar, rapids_options, 't')
worker_info = Utils.get_value_or_pop(worker_info, rapids_options, 'w')
local_folder = Utils.get_value_or_pop(local_folder, rapids_options, 'l')
if verbose:
# When verbose is enabled, turn on debug logging via the environment.
ToolLogging.enable_debug_mode()
wrapper_prof_options = {
'platformOpts': {
# the databricks profile
'profile': profile,
'credentialFile': credentials_file,
'deployMode': DeployMode.LOCAL,
},
'migrationClustersProps': {
'gpuCluster': gpu_cluster
},
'jobSubmissionProps': {
'remoteFolder': remote_folder,
'platformArgs': {
'jvmMaxHeapSize': jvm_heap_size
}
},
'eventlogs': eventlogs,
'toolsJar': tools_jar,
'autoTunerFileInput': worker_info
}
ProfilingAsLocal(platform_type=CspEnv.DATABRICKS_AZURE,
output_folder=local_folder,
wrapper_options=wrapper_prof_options,
rapids_options=rapids_options).launch()
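
# Illustrative sketch, not part of the original file: both methods above resolve their
# arguments through Utils.get_value_or_pop, which appears to prefer an explicitly passed
# value, then a short-flag entry popped from rapids_options, then a default, roughly:
#     def get_value_or_pop(provided_value, options, short_flag, default=None):
#         if provided_value is not None:
#             return provided_value
#         return options.pop(short_flag, default)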

@staticmethod
def diagnostic(cluster: str,
profile: str = None,
@@ -294,10 +78,8 @@ def diagnostic(cluster: str,

class DBAzureWrapper: # pylint: disable=too-few-public-methods
"""
A wrapper script to run RAPIDS Accelerator tools (Qualification, Profiling, and Diagnostic) on Databricks_Azure.
A wrapper script to run the RAPIDS Accelerator Diagnostic tool on Databricks_Azure.
"""

def __init__(self):
self.qualification = CliDBAzureLocalMode.qualification
self.profiling = CliDBAzureLocalMode.profiling
self.diagnostic = CliDBAzureLocalMode.diagnostic
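
With the qualification and profiling entry points removed from this wrapper, only the diagnostic command appears to remain reachable. A hypothetical programmatic use after this change, where the cluster ID and profile name are placeholders:

from spark_rapids_pytools.wrappers.databricks_azure_wrapper import DBAzureWrapper

# Only the diagnostic tool is still exposed by the Azure wrapper.
wrapper = DBAzureWrapper()
wrapper.diagnostic(cluster='my-databricks-cluster-id', profile='DEFAULT')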