diff --git a/data-processing-lib/doc/advanced-transform-tutorial.md b/data-processing-lib/doc/advanced-transform-tutorial.md index c2a5b37da..1627d2ebd 100644 --- a/data-processing-lib/doc/advanced-transform-tutorial.md +++ b/data-processing-lib/doc/advanced-transform-tutorial.md @@ -57,11 +57,11 @@ from typing import Any import pyarrow as pa import ray from data_processing.data_access import DataAccessFactory -from data_processing.launch.ray import ( - RayLauncherConfiguration, - DefaultTableTransformRuntimeRay, - RayUtils, - RayTransformLauncher, +from data_processing.runtime.ray import ( + RayLauncherConfiguration, + DefaultTableTransformRuntimeRay, + RayUtils, + RayTransformLauncher, ) from data_processing.transform import AbstractTableTransform from data_processing.utils import GB, TransformUtils @@ -70,10 +70,10 @@ from ray.actor import ActorHandle class EdedupTransform(AbstractTableTransform): - def __init__(self, config: dict): - super().__init__(config) - self.doc_column = config.get("doc_column", "") - self.hashes = config.get("hashes", []) + def __init__(self, config: dict): + super().__init__(config) + self.doc_column = config.get("doc_column", "") + self.hashes = config.get("hashes", []) ``` The `EdedupTransform` class extends the `AbstractTableTransform`, which defines the required methods. @@ -136,7 +136,7 @@ If there is no metadata then simply return an empty dictionary. First, let's define the transform runtime class. To do this we extend the base abstract/interface class -[DefaultTableTransformRuntime](../src/data_processing/launch/ray/transform_runtime.py), +[DefaultTableTransformRuntime](../src/data_processing/runtime/ray/transform_runtime.py), which requires definition of the following: * an initializer (i.e. `init()`) that accepts a dictionary of configuration data. For this example, the configuration data will only be defined by diff --git a/data-processing-lib/doc/architecture.md b/data-processing-lib/doc/architecture.md index 4466e7e38..34ca36258 100644 --- a/data-processing-lib/doc/architecture.md +++ b/data-processing-lib/doc/architecture.md @@ -13,10 +13,10 @@ process many input files in parallel using a distribute network of RayWorkers. The architecture includes the following core components: -* [RayLauncher](../src/data_processing/launch/ray/transform_launcher.py) accepts and validates +* [RayLauncher](../src/data_processing/runtime/ray/transform_launcher.py) accepts and validates CLI parameters to establish the Ray Orchestrator with the proper configuration. It uses the following components, all of which can/do define CLI configuration parameters.: - * [Transform Orchestrator Configuration](../src/data_processing/launch/ray/transform_orchestrator_configuration.py) is responsible + * [Transform Orchestrator Configuration](../src/data_processing/runtime/ray/transform_orchestrator_configuration.py) is responsible for defining and validating infrastructure parameters (e.g., number of workers, memory and cpu, local or remote cluster, etc.). This class has very simple state (several dictionaries) and is fully pickleable. As a result framework uses its instance as a @@ -25,14 +25,14 @@ It uses the following components, all of which can/do define CLI configuration p configuration for the type of DataAccess to use when reading/writing the input/output data for the transforms. Similar to Transform Orchestrator Configuration, this is a pickleable instance that is passed between Launcher, Orchestrator and Workers. - * [TransformConfiguration](../src/data_processing/launch/ray/transform_runtime.py) - defines specifics + * [TransformConfiguration](../src/data_processing/runtime/ray/transform_runtime.py) - defines specifics of the transform implementation including transform implementation class, its short name, any transform- specific CLI parameters, and an optional TransformRuntime class, discussed below. After all parameters are validated, the ray cluster is started and the DataAccessFactory, TransformOrchestratorConfiguraiton and TransformConfiguration are given to the Ray Orchestrator, via Ray remote() method invocation. The Launcher waits for the Ray Orchestrator to complete. -* [Ray Orchestrator](../src/data_processing/launch/ray/transform_orchestrator.py) is responsible for overall management of +* [Ray Orchestrator](../src/data_processing/runtime/ray/transform_orchestrator.py) is responsible for overall management of the data processing job. It creates the actors, determines the set of input data and distributes the references to the data files to be processed by the workers. More specifically, it performs the following: 1. Uses the DataAccess instance created by the DataAccessFactory to determine the set of the files @@ -53,12 +53,12 @@ It uses the following components, all of which can/do define CLI configuration p Once all data is processed, the orchestrator will collect execution statistics (from the statistics actor) and build and save it in the form of execution metadata (`metadata.json`). Finally, it will return the execution result to the Launcher. -* [Ray worker](../src/data_processing/launch/ray/transform_table_processor.py) is responsible for +* [Ray worker](../src/data_processing/runtime/ray/transform_table_processor.py) is responsible for reading files (as [PyArrow Tables](https://levelup.gitconnected.com/deep-dive-into-pyarrow-understanding-its-features-and-benefits-2cce8b1466c8)) assigned by the orchestrator, applying the transform to the input table and writing out the resulting table(s). Metadata produced by each table transformation is aggregated into Transform Statistics (below). -* [Transform Statistics](../src/data_processing/launch/ray/transform_statistics.py) is a general +* [Transform Statistics](../src/data_processing/runtime/ray/transform_statistics.py) is a general purpose data collector actor aggregating the numeric metadata from different places of the framework (especially metadata produced by the transform). These statistics are reported as metadata (`metadata.json`) by the orchestrator upon completion. @@ -92,10 +92,10 @@ For a more complete discussion, see the [tutorials](transform-tutorials.md). of any transform implementation - `transform()` and `flush()` - and provides the bulk of any transform implementation convert one Table to 0 or more new Tables. In general, this is not tied to the above Ray infrastructure and so can usually be used independent of Ray. -* [TransformRuntime ](../src/data_processing/launch/ray/transform_runtime.py) - this class only needs to be +* [TransformRuntime ](../src/data_processing/runtime/ray/transform_runtime.py) - this class only needs to be extended/implemented when additional Ray components (actors, shared memory objects, etc.) are used by the transform. The main method `get_transform_config()` is used to enable these extensions. -* [TransformConfiguration](../src/data_processing/launch/ray/transform_runtime.py) - this is the bootstrap +* [TransformConfiguration](../src/data_processing/runtime/ray/transform_runtime.py) - this is the bootstrap class provided to the Launcher that enables the instantiation of the Transform and the TransformRuntime within the architecture. It is a CLIProvider, which allows it to define transform-specific CLI configuration that is made available to the Transform's initializer. diff --git a/data-processing-lib/doc/overview.md b/data-processing-lib/doc/overview.md index bf70e7c3e..2a5f2de3a 100644 --- a/data-processing-lib/doc/overview.md +++ b/data-processing-lib/doc/overview.md @@ -12,10 +12,10 @@ developers of data transformation are: * [Transformation](../src/data_processing/transform/table_transform.py) - a simple, easily-implemented interface defines the specifics of a given data transformation. -* [Transform Configuration](../src/data_processing/launch/ray/transform_runtime.py) - defines +* [Transform Configuration](../src/data_processing/runtime/ray/transform_runtime.py) - defines the transform implementation and runtime classes, the command line arguments specific to transform, and the short name for the transform. -* [Transformation Runtime](../src/data_processing/launch/ray/transform_runtime.py) - allows for customization of the Ray environment for the transformer. +* [Transformation Runtime](../src/data_processing/runtime/ray/transform_runtime.py) - allows for customization of the Ray environment for the transformer. This might include provisioning of shared memory objects or creation of additional actors. To learn more consider the following: diff --git a/data-processing-lib/doc/simplest-transform-tutorial.md b/data-processing-lib/doc/simplest-transform-tutorial.md index deaa588c3..1fed02e88 100644 --- a/data-processing-lib/doc/simplest-transform-tutorial.md +++ b/data-processing-lib/doc/simplest-transform-tutorial.md @@ -54,7 +54,7 @@ from argparse import ArgumentParser, Namespace from typing import Any import pyarrow as pa -from data_processing.launch.ray import ( +from data_processing.runtime.ray import ( RayLauncherConfiguration, DefaultTableTransformRuntimeRay, RayTransformLauncher, diff --git a/data-processing-lib/doc/transform-tutorials.md b/data-processing-lib/doc/transform-tutorials.md index b8df4e5cb..f4e9895d4 100644 --- a/data-processing-lib/doc/transform-tutorials.md +++ b/data-processing-lib/doc/transform-tutorials.md @@ -4,7 +4,7 @@ All transforms operate on a [pyarrow Table](https://arrow.apache.org/docs/python and produce zero or more transformed tables and transform specific metadata. The Transform itself need only be concerned with the conversion of one in memory table at a time. -When running in the Ray worker (i.e. [TransformTableProcessor](../src/data_processing/launch/ray/transform_table_processor.py) ), the input +When running in the Ray worker (i.e. [TransformTableProcessor](../src/data_processing/runtime/ray/transform_table_processor.py) ), the input tables are read from parquet files and the transform table(s) is/are stored in destination parquet files. Metadata accumulated across calls to all transforms is stored in the destination. @@ -39,30 +39,30 @@ not need this feature, a default implementation is provided to return an empty l ### Running in Ray When a transform is run using the Ray-based framework a number of other capabilities are involved: -* [Transform Runtime](../src/data_processing/launch/ray/transform_runtime.py) - this provides the ability for the +* [Transform Runtime](../src/data_processing/runtime/ray/transform_runtime.py) - this provides the ability for the transform implementor to create additional Ray resources and include them in the configuration used to create a transform (see, for example, [exact dedup](../../transforms/universal/ededup/src/ededup_transform.py) or [blocklist](../../transforms/universal/blocklisting/src/blocklist_transform.py)). This also provide the ability to supplement the statics collected by -[Statistics](../src/data_processing/launch/ray/transform_statistics.py) (see below). -* [Transform Configuration](../src/data_processing/launch/ray/transform_runtime.py) - defines the following: +[Statistics](../src/data_processing/runtime/ray/transform_statistics.py) (see below). +* [Transform Configuration](../src/data_processing/runtime/ray/transform_runtime.py) - defines the following: * the transform class to be used, * command line arguments used to initialize the Transform Runtime and generally, the Transform. * Transform Runtime class to use * transform short name -* [Transform Launcher](../src/data_processing/launch/ray/transform_launcher.py) - this is a class generally used to +* [Transform Launcher](../src/data_processing/runtime/ray/transform_launcher.py) - this is a class generally used to implement `main()` that makes use of a Transform Configuration to start the Ray runtime and execute the transforms. Roughly speaking the following steps are completed to establish transforms in the RayWorkers 1. Launcher parses the CLI parameters using an ArgumentParser configured with its own CLI parameters along with those of the Transform Configuration, -2. Launcher passes the Transform Configuration and CLI parameters to the [RayOrchestrator](../src/data_processing/launch/ray/transform_orchestrator.py) +2. Launcher passes the Transform Configuration and CLI parameters to the [RayOrchestrator](../src/data_processing/runtime/ray/transform_orchestrator.py) 3. RayOrchestrator creates the Transform Runtime using the Transform Configuration and its CLI parameter values 4. Transform Runtime creates transform initialization/configuration including the CLI parameters, and any Ray components need by the transform. -5. [RayWorker](../src/data_processing/launch/ray/transform_table_processor.py) is started with configuration from the Transform Runtime. +5. [RayWorker](../src/data_processing/runtime/ray/transform_table_processor.py) is started with configuration from the Transform Runtime. 6. RayWorker creates the Transform using the configuration provided by the Transform Runtime. 7. Statistics is used to collect the statistics submitted by the individual transform, that is used for building execution metadata. @@ -70,16 +70,16 @@ is used for building execution metadata. ![Processing Architecture](processing-architecture.jpg) #### Transform Launcher -The [TransformLauncher](../src/data_processing/launch/ray/transform_launcher.py) uses the Transform Configuration +The [TransformLauncher](../src/data_processing/runtime/ray/transform_launcher.py) uses the Transform Configuration and provides a single method, `launch()`, that kicks off the Ray environment and transform execution coordinated -by [orchestrator](../src/data_processing/launch/ray/transform_orchestrator.py). +by [orchestrator](../src/data_processing/runtime/ray/transform_orchestrator.py). For example, ```python launcher = TransformLauncher(MyTransformConfiguration()) launcher.launch() ``` Note that the launcher defines some additional CLI parameters that are used to control the operation of the -[orchestrator and workers](../src/data_processing/launch/ray/transform_orchestrator_configuration.py) and +[orchestrator and workers](../src/data_processing/runtime/ray/transform_orchestrator_configuration.py) and [data access](../src/data_processing/data_access/data_access_factory.py). Things such as data access configuration, number of workers, worker resources, etc. Discussion of these options is beyond the scope of this document @@ -87,7 +87,7 @@ Discussion of these options is beyond the scope of this document #### Transform Configuration The -[DefaultTableTransformConfiguration](../src/data_processing/launch/ray/transform_runtime.py) +[DefaultTableTransformConfiguration](../src/data_processing/runtime/ray/transform_runtime.py) class is sub-classed and initialized with transform-specific name, and implementation and runtime classes. In addition, it is responsible for providing transform-specific methods to define and filter optional command line arguments. @@ -112,7 +112,7 @@ Details are covered in the samples below. #### Transform Runtime The -[DefaultTableTransformRuntime](../src/data_processing/launch/ray/transform_runtime.py) +[DefaultTableTransformRuntime](../src/data_processing/runtime/ray/transform_runtime.py) class is provided and will be sufficient for many use cases, especially 1:1 table transformation. However, some transforms will require use of the Ray environment, for example, diff --git a/data-processing-lib/src/data_processing/launch/__init__.py b/data-processing-lib/src/data_processing/launch/__init__.py deleted file mode 100644 index d849e4dea..000000000 --- a/data-processing-lib/src/data_processing/launch/__init__.py +++ /dev/null @@ -1,2 +0,0 @@ -from data_processing.launch.execution_configuration import TransformExecutionConfiguration -from data_processing.launch.transform_launcher import AbstractTransformLauncher diff --git a/data-processing-lib/src/data_processing/launch/pure_python/__init__.py b/data-processing-lib/src/data_processing/launch/pure_python/__init__.py deleted file mode 100644 index d51b9b53f..000000000 --- a/data-processing-lib/src/data_processing/launch/pure_python/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -from data_processing.launch.pure_python.python_launcher_configuration import PythonLauncherConfiguration -from data_processing.launch.pure_python.transform_table_processor import TransformTableProcessor -from data_processing.launch.pure_python.transform_orchestrator import orchestrate -from data_processing.launch.pure_python.transform_launcher import PythonTransformLauncher \ No newline at end of file diff --git a/data-processing-lib/src/data_processing/launch/ray/__init__.py b/data-processing-lib/src/data_processing/launch/ray/__init__.py deleted file mode 100644 index 07a6cdd10..000000000 --- a/data-processing-lib/src/data_processing/launch/ray/__init__.py +++ /dev/null @@ -1,8 +0,0 @@ -from data_processing.launch.ray.ray_utils import RayUtils -from data_processing.launch.ray.transform_statistics import TransformStatisticsRay -from data_processing.launch.ray.transform_table_processor import TransformTableProcessorRay -from data_processing.launch.ray.transform_runtime import DefaultTableTransformRuntimeRay -from data_processing.launch.ray.transform_launch_configuration import RayLauncherConfiguration -from data_processing.launch.ray.transform_orchestrator_configuration import TransformOrchestratorConfiguration -from data_processing.launch.ray.transform_orchestrator import orchestrate -from data_processing.launch.ray.transform_launcher import RayTransformLauncher diff --git a/data-processing-lib/src/data_processing/runtime/__init__.py b/data-processing-lib/src/data_processing/runtime/__init__.py new file mode 100644 index 000000000..3a19567d8 --- /dev/null +++ b/data-processing-lib/src/data_processing/runtime/__init__.py @@ -0,0 +1,2 @@ +from data_processing.runtime.execution_configuration import TransformExecutionConfiguration +from data_processing.runtime.transform_launcher import AbstractTransformLauncher diff --git a/data-processing-lib/src/data_processing/launch/execution_configuration.py b/data-processing-lib/src/data_processing/runtime/execution_configuration.py similarity index 100% rename from data-processing-lib/src/data_processing/launch/execution_configuration.py rename to data-processing-lib/src/data_processing/runtime/execution_configuration.py diff --git a/data-processing-lib/src/data_processing/runtime/pure_python/__init__.py b/data-processing-lib/src/data_processing/runtime/pure_python/__init__.py new file mode 100644 index 000000000..3cc9cd76e --- /dev/null +++ b/data-processing-lib/src/data_processing/runtime/pure_python/__init__.py @@ -0,0 +1,4 @@ +from data_processing.runtime.pure_python.python_launcher_configuration import PythonLauncherConfiguration +from data_processing.runtime.pure_python.transform_table_processor import TransformTableProcessor +from data_processing.runtime.pure_python.transform_orchestrator import orchestrate +from data_processing.runtime.pure_python.transform_launcher import PythonTransformLauncher diff --git a/data-processing-lib/src/data_processing/launch/pure_python/python_launcher_configuration.py b/data-processing-lib/src/data_processing/runtime/pure_python/python_launcher_configuration.py similarity index 95% rename from data-processing-lib/src/data_processing/launch/pure_python/python_launcher_configuration.py rename to data-processing-lib/src/data_processing/runtime/pure_python/python_launcher_configuration.py index 2686ff244..631d1ae46 100644 --- a/data-processing-lib/src/data_processing/launch/pure_python/python_launcher_configuration.py +++ b/data-processing-lib/src/data_processing/runtime/pure_python/python_launcher_configuration.py @@ -9,11 +9,10 @@ # See the License for the specific language governing permissions and # limitations under the License. ################################################################################ -from typing import Any from argparse import ArgumentParser, Namespace +from typing import Any -from data_processing.transform import TransformConfiguration -from data_processing.transform import AbstractTableTransform +from data_processing.transform import AbstractTableTransform, TransformConfiguration from data_processing.utils import CLIArgumentProvider @@ -28,7 +27,8 @@ class PythonLauncherConfiguration(CLIArgumentProvider): """ def __init__( - self, transform_configuration: TransformConfiguration, + self, + transform_configuration: TransformConfiguration, ): """ Initialization diff --git a/data-processing-lib/src/data_processing/launch/pure_python/transform_launcher.py b/data-processing-lib/src/data_processing/runtime/pure_python/transform_launcher.py similarity index 91% rename from data-processing-lib/src/data_processing/launch/pure_python/transform_launcher.py rename to data-processing-lib/src/data_processing/runtime/pure_python/transform_launcher.py index 5513b1803..7b77a43e5 100644 --- a/data-processing-lib/src/data_processing/launch/pure_python/transform_launcher.py +++ b/data-processing-lib/src/data_processing/runtime/pure_python/transform_launcher.py @@ -14,12 +14,10 @@ import time from data_processing.data_access import DataAccessFactory, DataAccessFactoryBase -from data_processing.launch import TransformExecutionConfiguration +from data_processing.runtime import TransformExecutionConfiguration +from data_processing.runtime.pure_python import PythonLauncherConfiguration, orchestrate +from data_processing.runtime.transform_launcher import AbstractTransformLauncher from data_processing.transform import TransformConfiguration -from data_processing.launch.pure_python import orchestrate -from data_processing.launch.transform_launcher import AbstractTransformLauncher - -from data_processing.launch.pure_python import PythonLauncherConfiguration from data_processing.utils import get_logger @@ -33,7 +31,7 @@ class PythonTransformLauncher(AbstractTransformLauncher): def __init__( self, - # transform_runtime_config: PythonLauncherConfiguration, + # transform_runtime_config: PythonLauncherConfiguration, transform_config: TransformConfiguration, data_access_factory: DataAccessFactoryBase = DataAccessFactory(), ): diff --git a/data-processing-lib/src/data_processing/launch/pure_python/transform_orchestrator.py b/data-processing-lib/src/data_processing/runtime/pure_python/transform_orchestrator.py similarity index 93% rename from data-processing-lib/src/data_processing/launch/pure_python/transform_orchestrator.py rename to data-processing-lib/src/data_processing/runtime/pure_python/transform_orchestrator.py index e577098f5..fadf8a473 100644 --- a/data-processing-lib/src/data_processing/launch/pure_python/transform_orchestrator.py +++ b/data-processing-lib/src/data_processing/runtime/pure_python/transform_orchestrator.py @@ -15,12 +15,12 @@ from datetime import datetime from data_processing.data_access import DataAccessFactoryBase -from data_processing.launch import TransformExecutionConfiguration -from data_processing.launch.pure_python import TransformTableProcessor -from data_processing.transform import ( - TransformStatistics, +from data_processing.runtime import TransformExecutionConfiguration +from data_processing.runtime.pure_python import ( + PythonLauncherConfiguration, + TransformTableProcessor, ) -from data_processing.launch.pure_python import PythonLauncherConfiguration +from data_processing.transform import TransformStatistics from data_processing.utils import get_logger diff --git a/data-processing-lib/src/data_processing/launch/pure_python/transform_table_processor.py b/data-processing-lib/src/data_processing/runtime/pure_python/transform_table_processor.py similarity index 99% rename from data-processing-lib/src/data_processing/launch/pure_python/transform_table_processor.py rename to data-processing-lib/src/data_processing/runtime/pure_python/transform_table_processor.py index 4fe99c420..9f66be7b1 100644 --- a/data-processing-lib/src/data_processing/launch/pure_python/transform_table_processor.py +++ b/data-processing-lib/src/data_processing/runtime/pure_python/transform_table_processor.py @@ -16,9 +16,10 @@ import pyarrow as pa from data_processing.data_access import DataAccessFactoryBase +from data_processing.runtime.pure_python import PythonLauncherConfiguration from data_processing.transform import TransformStatistics from data_processing.utils import TransformUtils, get_logger -from data_processing.launch.pure_python import PythonLauncherConfiguration + logger = get_logger(__name__) diff --git a/data-processing-lib/src/data_processing/runtime/ray/__init__.py b/data-processing-lib/src/data_processing/runtime/ray/__init__.py new file mode 100644 index 000000000..bfe4edf9e --- /dev/null +++ b/data-processing-lib/src/data_processing/runtime/ray/__init__.py @@ -0,0 +1,8 @@ +from data_processing.runtime.ray.ray_utils import RayUtils +from data_processing.runtime.ray.transform_statistics import TransformStatisticsRay +from data_processing.runtime.ray.transform_table_processor import TransformTableProcessorRay +from data_processing.runtime.ray.transform_runtime import DefaultTableTransformRuntimeRay +from data_processing.runtime.ray.transform_launch_configuration import RayLauncherConfiguration +from data_processing.runtime.ray.transform_orchestrator_configuration import TransformOrchestratorConfiguration +from data_processing.runtime.ray.transform_orchestrator import orchestrate +from data_processing.runtime.ray.transform_launcher import RayTransformLauncher diff --git a/data-processing-lib/src/data_processing/launch/ray/ray_utils.py b/data-processing-lib/src/data_processing/runtime/ray/ray_utils.py similarity index 100% rename from data-processing-lib/src/data_processing/launch/ray/ray_utils.py rename to data-processing-lib/src/data_processing/runtime/ray/ray_utils.py diff --git a/data-processing-lib/src/data_processing/launch/ray/transform_configuration.py b/data-processing-lib/src/data_processing/runtime/ray/transform_configuration.py similarity index 94% rename from data-processing-lib/src/data_processing/launch/ray/transform_configuration.py rename to data-processing-lib/src/data_processing/runtime/ray/transform_configuration.py index 2fb68dd67..549aaa6fc 100644 --- a/data-processing-lib/src/data_processing/launch/ray/transform_configuration.py +++ b/data-processing-lib/src/data_processing/runtime/ray/transform_configuration.py @@ -1,4 +1,4 @@ -from data_processing.launch.ray import DefaultTableTransformRuntimeRay +from data_processing.runtime.ray import DefaultTableTransformRuntimeRay from data_processing.transform.transform_configuration import ( TransformConfiguration, TransformConfigurationProxy, diff --git a/data-processing-lib/src/data_processing/launch/ray/transform_launch_configuration.py b/data-processing-lib/src/data_processing/runtime/ray/transform_launch_configuration.py similarity index 90% rename from data-processing-lib/src/data_processing/launch/ray/transform_launch_configuration.py rename to data-processing-lib/src/data_processing/runtime/ray/transform_launch_configuration.py index 916c500cf..a38544dbc 100644 --- a/data-processing-lib/src/data_processing/launch/ray/transform_launch_configuration.py +++ b/data-processing-lib/src/data_processing/runtime/ray/transform_launch_configuration.py @@ -1,7 +1,6 @@ -from data_processing.launch.pure_python import PythonLauncherConfiguration +from data_processing.runtime.pure_python import PythonLauncherConfiguration +from data_processing.runtime.ray import DefaultTableTransformRuntimeRay from data_processing.transform import TransformConfiguration -from data_processing.launch.ray import DefaultTableTransformRuntimeRay - class RayLauncherConfiguration(PythonLauncherConfiguration): @@ -42,4 +41,4 @@ def create_transform_runtime(self) -> DefaultTableTransformRuntimeRay: Create transform runtime with the parameters captured during apply_input_params() :return: transform runtime object """ - return self.runtime_class(self.params) \ No newline at end of file + return self.runtime_class(self.params) diff --git a/data-processing-lib/src/data_processing/launch/ray/transform_launcher.py b/data-processing-lib/src/data_processing/runtime/ray/transform_launcher.py similarity index 95% rename from data-processing-lib/src/data_processing/launch/ray/transform_launcher.py rename to data-processing-lib/src/data_processing/runtime/ray/transform_launcher.py index fcbaf5d40..924b6f0dd 100644 --- a/data-processing-lib/src/data_processing/launch/ray/transform_launcher.py +++ b/data-processing-lib/src/data_processing/runtime/ray/transform_launcher.py @@ -16,14 +16,16 @@ import ray from data_processing.data_access import DataAccessFactory, DataAccessFactoryBase -from data_processing.launch.ray import ( +from data_processing.runtime.ray import ( DefaultTableTransformRuntimeRay, RayLauncherConfiguration, TransformOrchestratorConfiguration, orchestrate, ) -from data_processing.launch.ray.transform_configuration import RayTransformConfiguration -from data_processing.launch.transform_launcher import AbstractTransformLauncher +from data_processing.runtime.ray.transform_configuration import ( + RayTransformConfiguration, +) +from data_processing.runtime.transform_launcher import AbstractTransformLauncher from data_processing.transform import TransformConfiguration from data_processing.utils import get_logger, str2bool diff --git a/data-processing-lib/src/data_processing/launch/ray/transform_orchestrator.py b/data-processing-lib/src/data_processing/runtime/ray/transform_orchestrator.py similarity index 99% rename from data-processing-lib/src/data_processing/launch/ray/transform_orchestrator.py rename to data-processing-lib/src/data_processing/runtime/ray/transform_orchestrator.py index 24272a9bf..d7b4bc37f 100644 --- a/data-processing-lib/src/data_processing/launch/ray/transform_orchestrator.py +++ b/data-processing-lib/src/data_processing/runtime/ray/transform_orchestrator.py @@ -16,9 +16,9 @@ import ray from data_processing.data_access import DataAccessFactoryBase -from data_processing.launch.ray import ( - RayUtils, +from data_processing.runtime.ray import ( RayLauncherConfiguration, + RayUtils, TransformOrchestratorConfiguration, TransformStatisticsRay, TransformTableProcessorRay, diff --git a/data-processing-lib/src/data_processing/launch/ray/transform_orchestrator_configuration.py b/data-processing-lib/src/data_processing/runtime/ray/transform_orchestrator_configuration.py similarity index 98% rename from data-processing-lib/src/data_processing/launch/ray/transform_orchestrator_configuration.py rename to data-processing-lib/src/data_processing/runtime/ray/transform_orchestrator_configuration.py index a67a308f0..b66f536bb 100644 --- a/data-processing-lib/src/data_processing/launch/ray/transform_orchestrator_configuration.py +++ b/data-processing-lib/src/data_processing/runtime/ray/transform_orchestrator_configuration.py @@ -14,7 +14,7 @@ import ast from typing import Any -from data_processing.launch import TransformExecutionConfiguration +from data_processing.runtime import TransformExecutionConfiguration from data_processing.utils import CLIArgumentProvider, ParamsUtils, get_logger diff --git a/data-processing-lib/src/data_processing/launch/ray/transform_runtime.py b/data-processing-lib/src/data_processing/runtime/ray/transform_runtime.py similarity index 99% rename from data-processing-lib/src/data_processing/launch/ray/transform_runtime.py rename to data-processing-lib/src/data_processing/runtime/ray/transform_runtime.py index a2e2771e5..acac0c8a6 100644 --- a/data-processing-lib/src/data_processing/launch/ray/transform_runtime.py +++ b/data-processing-lib/src/data_processing/runtime/ray/transform_runtime.py @@ -51,6 +51,3 @@ def compute_execution_stats(self, stats: dict[str, Any]) -> dict[str, Any]: :return: job execution statistics. These are generally reported as metadata by the Ray Orchestrator. """ return stats - - - diff --git a/data-processing-lib/src/data_processing/launch/ray/transform_statistics.py b/data-processing-lib/src/data_processing/runtime/ray/transform_statistics.py similarity index 100% rename from data-processing-lib/src/data_processing/launch/ray/transform_statistics.py rename to data-processing-lib/src/data_processing/runtime/ray/transform_statistics.py diff --git a/data-processing-lib/src/data_processing/launch/ray/transform_table_processor.py b/data-processing-lib/src/data_processing/runtime/ray/transform_table_processor.py similarity index 100% rename from data-processing-lib/src/data_processing/launch/ray/transform_table_processor.py rename to data-processing-lib/src/data_processing/runtime/ray/transform_table_processor.py diff --git a/data-processing-lib/src/data_processing/launch/transform_launcher.py b/data-processing-lib/src/data_processing/runtime/transform_launcher.py similarity index 76% rename from data-processing-lib/src/data_processing/launch/transform_launcher.py rename to data-processing-lib/src/data_processing/runtime/transform_launcher.py index 43ac603f7..0c3d85459 100644 --- a/data-processing-lib/src/data_processing/launch/transform_launcher.py +++ b/data-processing-lib/src/data_processing/runtime/transform_launcher.py @@ -1,11 +1,11 @@ -from data_processing.data_access import DataAccessFactoryBase, DataAccessFactory +from data_processing.data_access import DataAccessFactory, DataAccessFactoryBase from data_processing.transform import TransformConfiguration class AbstractTransformLauncher: def __init__( self, - # transform_runtime_config: PythonLauncherConfiguration, + # transform_runtime_config: PythonLauncherConfiguration, transform_config: TransformConfiguration, data_access_factory: DataAccessFactoryBase = DataAccessFactory(), ): @@ -19,7 +19,7 @@ def __init__( self.data_access_factory = data_access_factory def launch(self): - raise ValueError("must be implemented by subclass"); + raise ValueError("must be implemented by subclass") def get_transform_name(self) -> str: return self.name diff --git a/data-processing-lib/src/data_processing/test_support/launch/transform_test.py b/data-processing-lib/src/data_processing/test_support/launch/transform_test.py index 554f18775..6dcee5b97 100644 --- a/data-processing-lib/src/data_processing/test_support/launch/transform_test.py +++ b/data-processing-lib/src/data_processing/test_support/launch/transform_test.py @@ -14,8 +14,8 @@ import tempfile from typing import Any -from data_processing.launch.ray import RayTransformLauncher -from data_processing.launch.transform_launcher import AbstractTransformLauncher +from data_processing.runtime.ray import RayTransformLauncher +from data_processing.runtime.transform_launcher import AbstractTransformLauncher from data_processing.test_support.abstract_test import AbstractTest from data_processing.utils import ParamsUtils @@ -30,11 +30,13 @@ class AbstractTransformLauncherTest(AbstractTest): """ @staticmethod - def _get_argv(launcher:AbstractTransformLauncher, cli_params: dict[str, Any], in_table_path: str, out_table_path: str): + def _get_argv( + launcher: AbstractTransformLauncher, cli_params: dict[str, Any], in_table_path: str, out_table_path: str + ): args = {} | cli_params local_ast = {"input_folder": in_table_path, "output_folder": out_table_path} args["data_local_config"] = local_ast - if isinstance(launcher,RayTransformLauncher): + if isinstance(launcher, RayTransformLauncher): args["run_locally"] = "True" argv = ParamsUtils.dict_to_req(args) return argv diff --git a/data-processing-lib/src/data_processing/test_support/transform/noop_transform.py b/data-processing-lib/src/data_processing/test_support/transform/noop_transform.py index 6c5e57329..c7c8e9cf7 100644 --- a/data-processing-lib/src/data_processing/test_support/transform/noop_transform.py +++ b/data-processing-lib/src/data_processing/test_support/transform/noop_transform.py @@ -15,8 +15,10 @@ from typing import Any import pyarrow as pa -from data_processing.launch.pure_python import PythonTransformLauncher -from data_processing.launch.ray.transform_configuration import RayTransformConfiguration +from data_processing.runtime.pure_python import PythonTransformLauncher +from data_processing.runtime.ray.transform_configuration import ( + RayTransformConfiguration, +) from data_processing.transform import AbstractTableTransform, TransformConfiguration from data_processing.utils import CLIArgumentProvider, get_logger diff --git a/data-processing-lib/test/data_processing_tests/launch/pure_python/launcher_test.py b/data-processing-lib/test/data_processing_tests/launch/pure_python/launcher_test.py index 82b698415..3a874d0b7 100644 --- a/data-processing-lib/test/data_processing_tests/launch/pure_python/launcher_test.py +++ b/data-processing-lib/test/data_processing_tests/launch/pure_python/launcher_test.py @@ -13,9 +13,11 @@ import os import sys -from data_processing.transform import TransformConfiguration -from data_processing.launch.pure_python import PythonTransformLauncher, PythonLauncherConfiguration -from data_processing.transform import AbstractTableTransform +from data_processing.runtime.pure_python import ( + PythonLauncherConfiguration, + PythonTransformLauncher, +) +from data_processing.transform import AbstractTableTransform, TransformConfiguration from data_processing.utils import ParamsUtils @@ -39,9 +41,11 @@ code_location = {"github": "github", "commit_hash": "12345", "path": "path"} + class TestingTransformConfiguration(TransformConfiguration): def __init__(self): - super().__init__("test", transform_class = AbstractTableTransform) + super().__init__("test", transform_class=AbstractTableTransform) + class TestLauncherPython(PythonTransformLauncher): """ diff --git a/data-processing-lib/test/data_processing_tests/launch/ray/launcher_test.py b/data-processing-lib/test/data_processing_tests/launch/ray/launcher_test.py index addfbf936..51108e078 100644 --- a/data-processing-lib/test/data_processing_tests/launch/ray/launcher_test.py +++ b/data-processing-lib/test/data_processing_tests/launch/ray/launcher_test.py @@ -13,8 +13,10 @@ import os import sys -from data_processing.launch.ray import RayTransformLauncher -from data_processing.launch.ray.transform_configuration import RayTransformConfiguration +from data_processing.runtime.ray import RayTransformLauncher +from data_processing.runtime.ray.transform_configuration import ( + RayTransformConfiguration, +) from data_processing.test_support.transform import NOOPTransformConfiguration from data_processing.test_support.transform.noop_transform import ( NOOPRayTransformConfiguration, diff --git a/data-processing-lib/test/data_processing_tests/launch/ray/ray_util_test.py b/data-processing-lib/test/data_processing_tests/launch/ray/ray_util_test.py index c22436eb5..29dcd5f78 100644 --- a/data-processing-lib/test/data_processing_tests/launch/ray/ray_util_test.py +++ b/data-processing-lib/test/data_processing_tests/launch/ray/ray_util_test.py @@ -12,7 +12,7 @@ import pyarrow as pa import ray -from data_processing.launch.ray import RayUtils, TransformStatisticsRay +from data_processing.runtime.ray import RayUtils, TransformStatisticsRay from data_processing.utils import GB, TransformUtils diff --git a/data-processing-lib/test/data_processing_tests/launch/ray/test_noop_launch.py b/data-processing-lib/test/data_processing_tests/launch/ray/test_noop_launch.py index f92cee0eb..d950d352d 100644 --- a/data-processing-lib/test/data_processing_tests/launch/ray/test_noop_launch.py +++ b/data-processing-lib/test/data_processing_tests/launch/ray/test_noop_launch.py @@ -13,7 +13,7 @@ import os import pyarrow as pa -from data_processing.launch.ray import RayTransformLauncher +from data_processing.runtime.ray import RayTransformLauncher from data_processing.test_support.launch.transform_test import ( AbstractTransformLauncherTest, ) diff --git a/transforms/code/code_quality/src/code_quality_local_pure.py b/transforms/code/code_quality/src/code_quality_local_pure.py index 884fef94d..20b409a33 100644 --- a/transforms/code/code_quality/src/code_quality_local_pure.py +++ b/transforms/code/code_quality/src/code_quality_local_pure.py @@ -15,7 +15,7 @@ from pathlib import Path from code_quality_transform import CodeQualityTransformConfiguration -from data_processing.launch.pure_python import PythonTransformLauncher +from data_processing.runtime.pure_python import PythonTransformLauncher from data_processing.utils import ParamsUtils diff --git a/transforms/code/code_quality/src/code_quality_local_ray.py b/transforms/code/code_quality/src/code_quality_local_ray.py index 48eed9ad8..312c54bc9 100644 --- a/transforms/code/code_quality/src/code_quality_local_ray.py +++ b/transforms/code/code_quality/src/code_quality_local_ray.py @@ -15,7 +15,7 @@ from pathlib import Path from code_quality_transform import CodeQualityRayTransformConfiguration -from data_processing.launch.ray import RayTransformLauncher +from data_processing.runtime.ray import RayTransformLauncher from data_processing.utils import ParamsUtils diff --git a/transforms/code/code_quality/src/code_quality_s3_ray.py b/transforms/code/code_quality/src/code_quality_s3_ray.py index 1954a37c0..56354c469 100644 --- a/transforms/code/code_quality/src/code_quality_s3_ray.py +++ b/transforms/code/code_quality/src/code_quality_s3_ray.py @@ -13,7 +13,7 @@ import sys from code_quality_transform import CodeQualityRayTransformConfiguration -from data_processing.launch.ray import RayTransformLauncher +from data_processing.runtime.ray import RayTransformLauncher from data_processing.utils import ParamsUtils diff --git a/transforms/code/code_quality/src/code_quality_transform.py b/transforms/code/code_quality/src/code_quality_transform.py index f7cc53e83..3ab68630f 100644 --- a/transforms/code/code_quality/src/code_quality_transform.py +++ b/transforms/code/code_quality/src/code_quality_transform.py @@ -24,8 +24,10 @@ import numpy as np import pyarrow as pa from bs4 import BeautifulSoup -from data_processing.launch.ray import RayTransformLauncher -from data_processing.launch.ray.transform_configuration import RayTransformConfiguration +from data_processing.runtime.ray import RayTransformLauncher +from data_processing.runtime.ray.transform_configuration import ( + RayTransformConfiguration, +) from data_processing.transform import AbstractTableTransform, TransformConfiguration from data_processing.utils import TransformUtils from transformers import AutoTokenizer diff --git a/transforms/code/code_quality/test/test_code_quality_launcher.py b/transforms/code/code_quality/test/test_code_quality_launcher.py index 5a9e16361..b056d0adc 100644 --- a/transforms/code/code_quality/test/test_code_quality_launcher.py +++ b/transforms/code/code_quality/test/test_code_quality_launcher.py @@ -13,7 +13,7 @@ import os from code_quality_transform import CodeQualityRayTransformConfiguration -from data_processing.launch.ray import RayTransformLauncher +from data_processing.runtime.ray import RayTransformLauncher from data_processing.test_support.launch.transform_test import ( AbstractTransformLauncherTest, ) diff --git a/transforms/code/malware/src/malware_local_pure.py b/transforms/code/malware/src/malware_local_pure.py index b94defa6b..5596e7f1f 100644 --- a/transforms/code/malware/src/malware_local_pure.py +++ b/transforms/code/malware/src/malware_local_pure.py @@ -14,9 +14,10 @@ import sys from pathlib import Path -from data_processing.launch.pure_python import PythonTransformLauncher +from data_processing.runtime.pure_python import PythonTransformLauncher from data_processing.utils import ParamsUtils -from malware_transform import check_clamd, MalwareTransformConfiguration +from malware_transform import MalwareTransformConfiguration, check_clamd + TEST_SOCKET = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", ".tmp", "clamd.ctl")) # create parameters diff --git a/transforms/code/malware/src/malware_local_ray.py b/transforms/code/malware/src/malware_local_ray.py index c2a16cd0b..6a9c5a28a 100644 --- a/transforms/code/malware/src/malware_local_ray.py +++ b/transforms/code/malware/src/malware_local_ray.py @@ -14,7 +14,7 @@ import sys from pathlib import Path -from data_processing.launch.ray import RayTransformLauncher +from data_processing.runtime.ray import RayTransformLauncher from data_processing.utils import ParamsUtils from malware_transform import MalwareRayTransformConfiguration, check_clamd diff --git a/transforms/code/malware/src/malware_transform.py b/transforms/code/malware/src/malware_transform.py index 8802ffe42..9356f9d3a 100644 --- a/transforms/code/malware/src/malware_transform.py +++ b/transforms/code/malware/src/malware_transform.py @@ -18,12 +18,14 @@ import clamd import pyarrow as pa -from data_processing.launch.pure_python import ( +from data_processing.runtime.pure_python import ( PythonLauncherConfiguration, PythonTransformLauncher, ) -from data_processing.launch.ray import RayTransformLauncher -from data_processing.launch.ray.transform_configuration import RayTransformConfiguration +from data_processing.runtime.ray import RayTransformLauncher +from data_processing.runtime.ray.transform_configuration import ( + RayTransformConfiguration, +) from data_processing.transform import AbstractTableTransform, TransformConfiguration from data_processing.utils import get_logger from data_processing.utils.transform_utils import TransformUtils diff --git a/transforms/code/malware/test/test_malware_ray.py b/transforms/code/malware/test/test_malware_ray.py index 903f2999f..f9fff8d5a 100644 --- a/transforms/code/malware/test/test_malware_ray.py +++ b/transforms/code/malware/test/test_malware_ray.py @@ -15,8 +15,10 @@ import os -from data_processing.launch.ray import RayTransformLauncher -from data_processing.launch.ray.transform_configuration import RayTransformConfiguration +from data_processing.runtime.ray import RayTransformLauncher +from data_processing.runtime.ray.transform_configuration import ( + RayTransformConfiguration, +) from data_processing.test_support.launch.transform_test import ( AbstractTransformLauncherTest, ) diff --git a/transforms/code/proglang_select/src/proglang_select_local_pure.py b/transforms/code/proglang_select/src/proglang_select_local_pure.py index b087c7b9c..124cd8690 100644 --- a/transforms/code/proglang_select/src/proglang_select_local_pure.py +++ b/transforms/code/proglang_select/src/proglang_select_local_pure.py @@ -13,12 +13,14 @@ import os import sys -from data_processing.launch.pure_python import PythonTransformLauncher +from data_processing.runtime.pure_python import PythonTransformLauncher from data_processing.utils import ParamsUtils from proglang_select_transform import ( + ProgLangSelectRuntime, + ProgLangSelectTransformConfiguration, lang_allowed_langs_file_key, lang_lang_column_key, - lang_output_column_key, ProgLangSelectTransformConfiguration, ProgLangSelectRuntime, + lang_output_column_key, ) diff --git a/transforms/code/proglang_select/src/proglang_select_local_ray.py b/transforms/code/proglang_select/src/proglang_select_local_ray.py index 37fc65a8f..33fc963a9 100644 --- a/transforms/code/proglang_select/src/proglang_select_local_ray.py +++ b/transforms/code/proglang_select/src/proglang_select_local_ray.py @@ -13,7 +13,7 @@ import os import sys -from data_processing.launch.ray import RayTransformLauncher +from data_processing.runtime.ray import RayTransformLauncher from data_processing.utils import ParamsUtils from proglang_select_transform import ( ProgLangSelectRayConfiguration, diff --git a/transforms/code/proglang_select/src/proglang_select_transform.py b/transforms/code/proglang_select/src/proglang_select_transform.py index b5dc565ca..f49137c59 100644 --- a/transforms/code/proglang_select/src/proglang_select_transform.py +++ b/transforms/code/proglang_select/src/proglang_select_transform.py @@ -20,15 +20,17 @@ DataAccessFactory, DataAccessFactoryBase, ) -from data_processing.launch.pure_python import ( +from data_processing.runtime.pure_python import ( PythonLauncherConfiguration, PythonTransformLauncher, ) -from data_processing.launch.ray import ( +from data_processing.runtime.ray import ( DefaultTableTransformRuntimeRay, RayTransformLauncher, ) -from data_processing.launch.ray.transform_configuration import RayTransformConfiguration +from data_processing.runtime.ray.transform_configuration import ( + RayTransformConfiguration, +) from data_processing.transform import AbstractTableTransform, TransformConfiguration from data_processing.utils import TransformUtils, get_logger from ray.actor import ActorHandle diff --git a/transforms/code/proglang_select/test/test_proglang_select_ray.py b/transforms/code/proglang_select/test/test_proglang_select_ray.py index 5f71aee90..b72be0452 100644 --- a/transforms/code/proglang_select/test/test_proglang_select_ray.py +++ b/transforms/code/proglang_select/test/test_proglang_select_ray.py @@ -12,7 +12,7 @@ import os -from data_processing.launch.ray import RayTransformLauncher +from data_processing.runtime.ray import RayTransformLauncher from data_processing.test_support.launch.transform_test import ( AbstractTransformLauncherTest, ) diff --git a/transforms/universal/doc_id/src/doc_id_local_pure.py b/transforms/universal/doc_id/src/doc_id_local_pure.py index 0b9b92ec4..6a1a26d08 100644 --- a/transforms/universal/doc_id/src/doc_id_local_pure.py +++ b/transforms/universal/doc_id/src/doc_id_local_pure.py @@ -13,10 +13,11 @@ import os import sys -from data_processing.launch.pure_python import PythonTransformLauncher +from data_processing.runtime.pure_python import PythonTransformLauncher from data_processing.utils import ParamsUtils from doc_id_transform import DocIDTransformConfiguration + # create parameters input_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "test-data", "input")) output_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "output")) diff --git a/transforms/universal/doc_id/src/doc_id_local_ray.py b/transforms/universal/doc_id/src/doc_id_local_ray.py index 7bd66457c..9ce8b5408 100644 --- a/transforms/universal/doc_id/src/doc_id_local_ray.py +++ b/transforms/universal/doc_id/src/doc_id_local_ray.py @@ -13,7 +13,7 @@ import os import sys -from data_processing.launch.ray import RayTransformLauncher +from data_processing.runtime.ray import RayTransformLauncher from data_processing.utils import ParamsUtils from doc_id_transform import DocIDRayTransformConfiguration diff --git a/transforms/universal/doc_id/src/doc_id_s3_ray.py b/transforms/universal/doc_id/src/doc_id_s3_ray.py index b3158fde7..14d3b0fc4 100644 --- a/transforms/universal/doc_id/src/doc_id_s3_ray.py +++ b/transforms/universal/doc_id/src/doc_id_s3_ray.py @@ -12,7 +12,7 @@ import sys -from data_processing.launch.ray import RayTransformLauncher +from data_processing.runtime.ray import RayTransformLauncher from data_processing.utils import ParamsUtils from doc_id_transform import DocIDRayTransformConfiguration diff --git a/transforms/universal/doc_id/src/doc_id_transform.py b/transforms/universal/doc_id/src/doc_id_transform.py index fb22d3aad..8e67c73a3 100644 --- a/transforms/universal/doc_id/src/doc_id_transform.py +++ b/transforms/universal/doc_id/src/doc_id_transform.py @@ -16,15 +16,17 @@ import pyarrow as pa import ray from data_processing.data_access import DataAccessFactoryBase -from data_processing.launch.pure_python import ( +from data_processing.runtime.pure_python import ( PythonLauncherConfiguration, PythonTransformLauncher, ) -from data_processing.launch.ray import ( +from data_processing.runtime.ray import ( DefaultTableTransformRuntimeRay, RayTransformLauncher, ) -from data_processing.launch.ray.transform_configuration import RayTransformConfiguration +from data_processing.runtime.ray.transform_configuration import ( + RayTransformConfiguration, +) from data_processing.transform import AbstractTableTransform, TransformConfiguration from data_processing.utils import CLIArgumentProvider, TransformUtils, get_logger from ray.actor import ActorHandle diff --git a/transforms/universal/doc_id/test/test_doc_id_ray.py b/transforms/universal/doc_id/test/test_doc_id_ray.py index ed6cace6a..4e82a27e9 100644 --- a/transforms/universal/doc_id/test/test_doc_id_ray.py +++ b/transforms/universal/doc_id/test/test_doc_id_ray.py @@ -12,7 +12,7 @@ import os -from data_processing.launch.ray import RayTransformLauncher +from data_processing.runtime.ray import RayTransformLauncher from data_processing.test_support.launch.transform_test import ( AbstractTransformLauncherTest, ) diff --git a/transforms/universal/ededup/src/ededup_local_ray.py b/transforms/universal/ededup/src/ededup_local_ray.py index a9b78b960..d5ce18c5e 100644 --- a/transforms/universal/ededup/src/ededup_local_ray.py +++ b/transforms/universal/ededup/src/ededup_local_ray.py @@ -13,7 +13,7 @@ import os import sys -from data_processing.launch.ray import RayTransformLauncher +from data_processing.runtime.ray import RayTransformLauncher from data_processing.utils import ParamsUtils from ededup_transform import EdedupRayTransformConfiguration diff --git a/transforms/universal/ededup/src/ededup_s3_ray.py b/transforms/universal/ededup/src/ededup_s3_ray.py index a57f89c90..8ff286687 100644 --- a/transforms/universal/ededup/src/ededup_s3_ray.py +++ b/transforms/universal/ededup/src/ededup_s3_ray.py @@ -12,7 +12,7 @@ import sys -from data_processing.launch.ray import RayTransformLauncher +from data_processing.runtime.ray import RayTransformLauncher from data_processing.utils import ParamsUtils from ededup_transform import EdedupRayTransformConfiguration diff --git a/transforms/universal/ededup/src/ededup_transform.py b/transforms/universal/ededup/src/ededup_transform.py index eded8151f..e044d9a81 100644 --- a/transforms/universal/ededup/src/ededup_transform.py +++ b/transforms/universal/ededup/src/ededup_transform.py @@ -16,12 +16,14 @@ import pyarrow as pa import ray from data_processing.data_access import DataAccessFactoryBase -from data_processing.launch.ray import ( +from data_processing.runtime.ray import ( DefaultTableTransformRuntimeRay, RayTransformLauncher, RayUtils, ) -from data_processing.launch.ray.transform_configuration import RayTransformConfiguration +from data_processing.runtime.ray.transform_configuration import ( + RayTransformConfiguration, +) from data_processing.transform import AbstractTableTransform, TransformConfiguration from data_processing.utils import GB, CLIArgumentProvider, TransformUtils, get_logger from ray.actor import ActorHandle diff --git a/transforms/universal/ededup/test/test_ededup_ray.py b/transforms/universal/ededup/test/test_ededup_ray.py index cf4f2f46c..1a2a9e696 100644 --- a/transforms/universal/ededup/test/test_ededup_ray.py +++ b/transforms/universal/ededup/test/test_ededup_ray.py @@ -12,7 +12,7 @@ import os -from data_processing.launch.ray import RayTransformLauncher +from data_processing.runtime.ray import RayTransformLauncher from data_processing.test_support.launch.transform_test import ( AbstractTransformLauncherTest, ) diff --git a/transforms/universal/fdedup/Readme.md b/transforms/universal/fdedup/Readme.md index 879731e7c..42820d879 100644 --- a/transforms/universal/fdedup/Readme.md +++ b/transforms/universal/fdedup/Readme.md @@ -32,7 +32,7 @@ The main components of runtime are described below [Table processing actor](src/fdedup_transform.py) is implemented following framework itself is implemented as a pair - `FdedupTransform` implementing the actual transformation and and -[transform table processor](../../../data-processing-lib/src/data_processing/launch/ray/transform_table_processor.py) +[transform table processor](../../../data-processing-lib/src/data_processing/runtime/ray/transform_table_processor.py) (from the framework itself). ### DocsMinHash Actor diff --git a/transforms/universal/fdedup/src/fdedup_local_ray.py b/transforms/universal/fdedup/src/fdedup_local_ray.py index d40d1a8c1..cdffabc25 100644 --- a/transforms/universal/fdedup/src/fdedup_local_ray.py +++ b/transforms/universal/fdedup/src/fdedup_local_ray.py @@ -13,7 +13,7 @@ import os import sys -from data_processing.launch.ray import RayTransformLauncher +from data_processing.runtime.ray import RayTransformLauncher from data_processing.utils import ParamsUtils from fdedup_transform import FdedupRayTransformConfiguration diff --git a/transforms/universal/fdedup/src/fdedup_s3_ray.py b/transforms/universal/fdedup/src/fdedup_s3_ray.py index 01bd8f0bf..dad3e8a72 100644 --- a/transforms/universal/fdedup/src/fdedup_s3_ray.py +++ b/transforms/universal/fdedup/src/fdedup_s3_ray.py @@ -12,7 +12,7 @@ import sys -from data_processing.launch.ray import RayTransformLauncher +from data_processing.runtime.ray import RayTransformLauncher from data_processing.utils import ParamsUtils from fdedup_transform import FdedupRayTransformConfiguration diff --git a/transforms/universal/fdedup/src/fdedup_support.py b/transforms/universal/fdedup/src/fdedup_support.py index 9264f437c..b8c45f933 100644 --- a/transforms/universal/fdedup/src/fdedup_support.py +++ b/transforms/universal/fdedup/src/fdedup_support.py @@ -16,7 +16,7 @@ import numpy as np import ray from data_processing.data_access import DataAccess -from data_processing.launch.ray import RayUtils +from data_processing.runtime.ray import RayUtils from data_processing.utils import GB, RANDOM_SEED, TransformUtils, get_logger from ray import cloudpickle from ray.actor import ActorHandle diff --git a/transforms/universal/fdedup/src/fdedup_transform.py b/transforms/universal/fdedup/src/fdedup_transform.py index c0297ed36..c5873a666 100644 --- a/transforms/universal/fdedup/src/fdedup_transform.py +++ b/transforms/universal/fdedup/src/fdedup_transform.py @@ -20,13 +20,15 @@ import pyarrow as pa import ray from data_processing.data_access import DataAccessFactoryBase -from data_processing.launch.ray import ( +from data_processing.runtime.ray import ( DefaultTableTransformRuntimeRay, RayTransformLauncher, RayUtils, TransformTableProcessorRay, ) -from data_processing.launch.ray.transform_configuration import RayTransformConfiguration +from data_processing.runtime.ray.transform_configuration import ( + RayTransformConfiguration, +) from data_processing.transform import AbstractTableTransform, TransformConfiguration from data_processing.utils import ( RANDOM_SEED, diff --git a/transforms/universal/fdedup/test/test_fdedup_ray.py b/transforms/universal/fdedup/test/test_fdedup_ray.py index 3d910cc8b..96dd3aa3d 100644 --- a/transforms/universal/fdedup/test/test_fdedup_ray.py +++ b/transforms/universal/fdedup/test/test_fdedup_ray.py @@ -12,7 +12,7 @@ import os -from data_processing.launch.ray import RayTransformLauncher +from data_processing.runtime.ray import RayTransformLauncher from data_processing.test_support.launch.transform_test import ( AbstractTransformLauncherTest, ) diff --git a/transforms/universal/filter/src/filter_local_pure.py b/transforms/universal/filter/src/filter_local_pure.py index b8e45cdef..86de7a3fa 100644 --- a/transforms/universal/filter/src/filter_local_pure.py +++ b/transforms/universal/filter/src/filter_local_pure.py @@ -13,12 +13,13 @@ import os import sys -from data_processing.launch.pure_python import PythonTransformLauncher +from data_processing.runtime.pure_python import PythonTransformLauncher from data_processing.utils import ParamsUtils from filter_transform import ( + FilterTransformConfiguration, filter_columns_to_drop_cli_param, filter_criteria_cli_param, - filter_logical_operator_cli_param, FilterTransformConfiguration, + filter_logical_operator_cli_param, ) diff --git a/transforms/universal/filter/src/filter_local_ray.py b/transforms/universal/filter/src/filter_local_ray.py index 1c69801fe..25ef6f600 100644 --- a/transforms/universal/filter/src/filter_local_ray.py +++ b/transforms/universal/filter/src/filter_local_ray.py @@ -13,7 +13,7 @@ import os import sys -from data_processing.launch.ray import RayTransformLauncher +from data_processing.runtime.ray import RayTransformLauncher from data_processing.utils import ParamsUtils from filter_transform import ( FilterRayTransformConfiguration, diff --git a/transforms/universal/filter/src/filter_s3_ray.py b/transforms/universal/filter/src/filter_s3_ray.py index 29bd72b6a..9d83e754d 100644 --- a/transforms/universal/filter/src/filter_s3_ray.py +++ b/transforms/universal/filter/src/filter_s3_ray.py @@ -12,7 +12,7 @@ import sys -from data_processing.launch.ray import RayTransformLauncher +from data_processing.runtime.ray import RayTransformLauncher from data_processing.utils import ParamsUtils from filter_transform import ( FilterRayTransformConfiguration, diff --git a/transforms/universal/filter/src/filter_transform.py b/transforms/universal/filter/src/filter_transform.py index efdd8ecc4..628b54e21 100644 --- a/transforms/universal/filter/src/filter_transform.py +++ b/transforms/universal/filter/src/filter_transform.py @@ -16,12 +16,14 @@ import duckdb import pyarrow as pa -from data_processing.launch.pure_python import ( +from data_processing.runtime.pure_python import ( PythonLauncherConfiguration, PythonTransformLauncher, ) -from data_processing.launch.ray import RayTransformLauncher -from data_processing.launch.ray.transform_configuration import RayTransformConfiguration +from data_processing.runtime.ray import RayTransformLauncher +from data_processing.runtime.ray.transform_configuration import ( + RayTransformConfiguration, +) from data_processing.transform import AbstractTableTransform, TransformConfiguration from data_processing.utils import CLIArgumentProvider, get_logger diff --git a/transforms/universal/filter/test/test_filter_ray.py b/transforms/universal/filter/test/test_filter_ray.py index a176d1193..f04f48e5f 100644 --- a/transforms/universal/filter/test/test_filter_ray.py +++ b/transforms/universal/filter/test/test_filter_ray.py @@ -12,7 +12,7 @@ import os -from data_processing.launch.ray import RayTransformLauncher +from data_processing.runtime.ray import RayTransformLauncher from data_processing.test_support.launch.transform_test import ( AbstractTransformLauncherTest, ) diff --git a/transforms/universal/noop/src/noop_local_pure.py b/transforms/universal/noop/src/noop_local_pure.py index c4503d855..20da4ae0a 100644 --- a/transforms/universal/noop/src/noop_local_pure.py +++ b/transforms/universal/noop/src/noop_local_pure.py @@ -13,10 +13,11 @@ import os import sys -from data_processing.launch.pure_python import PythonTransformLauncher +from data_processing.runtime.pure_python import PythonTransformLauncher from data_processing.utils import ParamsUtils from noop_transform import NOOPTransformConfiguration + # create parameters input_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "test-data", "input")) output_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "output")) diff --git a/transforms/universal/noop/src/noop_local_ray.py b/transforms/universal/noop/src/noop_local_ray.py index bf2e024b7..f5384a0a1 100644 --- a/transforms/universal/noop/src/noop_local_ray.py +++ b/transforms/universal/noop/src/noop_local_ray.py @@ -13,7 +13,7 @@ import os import sys -from data_processing.launch.ray import RayTransformLauncher +from data_processing.runtime.ray import RayTransformLauncher from data_processing.utils import ParamsUtils from noop_transform import NOOPRayTransformConfiguration diff --git a/transforms/universal/noop/src/noop_s3_ray.py b/transforms/universal/noop/src/noop_s3_ray.py index 31196f25a..a902b3c89 100644 --- a/transforms/universal/noop/src/noop_s3_ray.py +++ b/transforms/universal/noop/src/noop_s3_ray.py @@ -13,7 +13,7 @@ import os import sys -from data_processing.launch.ray import RayTransformLauncher +from data_processing.runtime.ray import RayTransformLauncher from data_processing.utils import ParamsUtils from noop_transform import NOOPRayTransformConfiguration diff --git a/transforms/universal/noop/src/noop_transform.py b/transforms/universal/noop/src/noop_transform.py index 720d02cb1..fcf7053a4 100644 --- a/transforms/universal/noop/src/noop_transform.py +++ b/transforms/universal/noop/src/noop_transform.py @@ -15,12 +15,14 @@ from typing import Any import pyarrow as pa -from data_processing.launch.pure_python import ( +from data_processing.runtime.pure_python import ( PythonLauncherConfiguration, PythonTransformLauncher, ) -from data_processing.launch.ray import RayTransformLauncher -from data_processing.launch.ray.transform_configuration import RayTransformConfiguration +from data_processing.runtime.ray import RayTransformLauncher +from data_processing.runtime.ray.transform_configuration import ( + RayTransformConfiguration, +) from data_processing.transform import AbstractTableTransform, TransformConfiguration from data_processing.utils import CLIArgumentProvider, get_logger diff --git a/transforms/universal/noop/test/test_noop_ray.py b/transforms/universal/noop/test/test_noop_ray.py index ee490bd14..42d46ed22 100644 --- a/transforms/universal/noop/test/test_noop_ray.py +++ b/transforms/universal/noop/test/test_noop_ray.py @@ -12,8 +12,8 @@ import os -from data_processing.launch.pure_python import PythonTransformLauncher -from data_processing.launch.ray import RayTransformLauncher +from data_processing.runtime.pure_python import PythonTransformLauncher +from data_processing.runtime.ray import RayTransformLauncher from data_processing.test_support.launch.transform_test import ( AbstractTransformLauncherTest, ) diff --git a/transforms/universal/tokenization/src/tokenization_local_pure.py b/transforms/universal/tokenization/src/tokenization_local_pure.py index 573d671b1..b84dda6d2 100644 --- a/transforms/universal/tokenization/src/tokenization_local_pure.py +++ b/transforms/universal/tokenization/src/tokenization_local_pure.py @@ -13,10 +13,11 @@ import os import sys -from data_processing.launch.pure_python import PythonTransformLauncher +from data_processing.runtime.pure_python import PythonTransformLauncher from data_processing.utils import ParamsUtils from tokenization_transform import TokenizationTransformConfiguration + # create parameters input_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "test-data", "ds01", "input")) output_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "output", "ds01")) diff --git a/transforms/universal/tokenization/src/tokenization_local_ray.py b/transforms/universal/tokenization/src/tokenization_local_ray.py index 7343a4567..482e5ccd9 100644 --- a/transforms/universal/tokenization/src/tokenization_local_ray.py +++ b/transforms/universal/tokenization/src/tokenization_local_ray.py @@ -13,7 +13,7 @@ import os import sys -from data_processing.launch.ray import RayTransformLauncher +from data_processing.runtime.ray import RayTransformLauncher from data_processing.utils import ParamsUtils from tokenization_transform import TokenizationRayConfiguration diff --git a/transforms/universal/tokenization/src/tokenization_local_ray_long_doc.py b/transforms/universal/tokenization/src/tokenization_local_ray_long_doc.py index 7456cbc36..382021230 100644 --- a/transforms/universal/tokenization/src/tokenization_local_ray_long_doc.py +++ b/transforms/universal/tokenization/src/tokenization_local_ray_long_doc.py @@ -13,7 +13,7 @@ import os import sys -from data_processing.launch.ray import RayTransformLauncher +from data_processing.runtime.ray import RayTransformLauncher from data_processing.utils import ParamsUtils from tokenization_transform import TokenizationRayConfiguration diff --git a/transforms/universal/tokenization/src/tokenization_s3_long_doc.py b/transforms/universal/tokenization/src/tokenization_s3_long_doc.py index 81f851812..2491cf742 100644 --- a/transforms/universal/tokenization/src/tokenization_s3_long_doc.py +++ b/transforms/universal/tokenization/src/tokenization_s3_long_doc.py @@ -12,7 +12,7 @@ import sys -from data_processing.launch.ray import RayTransformLauncher +from data_processing.runtime.ray import RayTransformLauncher from data_processing.utils import ParamsUtils from tokenization_transform import TokenizationRayConfiguration diff --git a/transforms/universal/tokenization/src/tokenization_s3_ray.py b/transforms/universal/tokenization/src/tokenization_s3_ray.py index ee7b9059b..d816cbd61 100644 --- a/transforms/universal/tokenization/src/tokenization_s3_ray.py +++ b/transforms/universal/tokenization/src/tokenization_s3_ray.py @@ -13,7 +13,7 @@ import os import sys -from data_processing.launch.ray import RayTransformLauncher +from data_processing.runtime.ray import RayTransformLauncher from data_processing.utils import ParamsUtils from tokenization_transform import TokenizationRayConfiguration diff --git a/transforms/universal/tokenization/src/tokenization_transform.py b/transforms/universal/tokenization/src/tokenization_transform.py index e6ceb6654..e6baffa49 100644 --- a/transforms/universal/tokenization/src/tokenization_transform.py +++ b/transforms/universal/tokenization/src/tokenization_transform.py @@ -20,8 +20,10 @@ from typing import Any import pyarrow as pa -from data_processing.launch.ray import RayTransformLauncher -from data_processing.launch.ray.transform_configuration import RayTransformConfiguration +from data_processing.runtime.ray import RayTransformLauncher +from data_processing.runtime.ray.transform_configuration import ( + RayTransformConfiguration, +) from data_processing.transform import AbstractTableTransform, TransformConfiguration from data_processing.utils import get_logger from tokenization_utils import is_valid_argument_string, load_tokenizer, split_text diff --git a/transforms/universal/tokenization/test/test_tokenization_launch_long_doc.py b/transforms/universal/tokenization/test/test_tokenization_launch_long_doc.py index 2cf61e895..8e7dd018e 100644 --- a/transforms/universal/tokenization/test/test_tokenization_launch_long_doc.py +++ b/transforms/universal/tokenization/test/test_tokenization_launch_long_doc.py @@ -12,7 +12,7 @@ import os -from data_processing.launch.ray import RayTransformLauncher +from data_processing.runtime.ray import RayTransformLauncher from data_processing.test_support.launch.transform_test import ( AbstractTransformLauncherTest, ) diff --git a/transforms/universal/tokenization/test/test_tokenization_ray.py b/transforms/universal/tokenization/test/test_tokenization_ray.py index 03383a836..7bdec0d40 100644 --- a/transforms/universal/tokenization/test/test_tokenization_ray.py +++ b/transforms/universal/tokenization/test/test_tokenization_ray.py @@ -12,7 +12,7 @@ import os -from data_processing.launch.ray import RayTransformLauncher +from data_processing.runtime.ray import RayTransformLauncher from data_processing.test_support.launch.transform_test import ( AbstractTransformLauncherTest, )