Commit

[no_early_kickoff] [Tune] Update Tuner.restore usage to prepare for `trainable` becoming a required arg (#32912)

* Fix links to internal docs

Signed-off-by: Justin Yu <justinvyu@berkeley.edu>
justinvyu authored Mar 28, 2023
1 parent 9b8d4ce commit b6958ed
Showing 12 changed files with 99 additions and 98 deletions.
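The change is mechanical: every `Tuner.restore(path)` call in the docs and tests now also passes the trainable explicitly. A minimal before/after sketch of the pattern, modeled on the doc snippets in this diff (the toy trainable, the `metric`/`x` keys, and the results path are illustrative, not part of the commit):

```python
from ray import tune
from ray.air import session


def trainable(config):
    # Stand-in for whatever trainable was passed to the original Tuner; the
    # restored Tuner must be given the same (or an equivalent) trainable.
    session.report({"metric": config["x"]})


# Old pattern (what this commit migrates away from): no trainable argument.
# tuner = tune.Tuner.restore("~/ray_results/test_tuner", restart_errored=True)

# New pattern: re-specify the trainable when restoring the experiment.
tuner = tune.Tuner.restore(
    path="~/ray_results/test_tuner",
    trainable=trainable,
    restart_errored=True,
)
tuner.fit()
```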
5 changes: 4 additions & 1 deletion doc/source/ray-air/doc_code/tuner.py
@@ -129,6 +129,7 @@
)
# __tune_preprocess_end__


# __tune_dataset_start__
def get_dataset():
return ray.data.read_csv("s3://anonymous@air-example-data/breast_cancer.csv")
@@ -240,6 +241,8 @@ def get_another_dataset():
# __tune_config_end__

# __tune_restore_start__
tuner = Tuner.restore("~/ray_results/test_tuner", restart_errored=True)
tuner = Tuner.restore(
path="~/ray_results/test_tuner", trainable=trainer, restart_errored=True
)
tuner.fit()
# __tune_restore_end__
15 changes: 10 additions & 5 deletions doc/source/tune/examples/tune_analyze_results.ipynb

Large diffs are not rendered by default.

2 changes: 2 additions & 0 deletions doc/source/tune/tutorials/tune-storage.rst
@@ -248,6 +248,7 @@ you can resume it any time starting from the experiment checkpoint state saved i
from ray import tune
tuner = tune.Tuner.restore(
"s3://my-checkpoints-bucket/path/my-tune-exp",
+trainable=my_trainable,
resume_errored=True
)
tuner.fit()
@@ -307,6 +308,7 @@ This experiment can be resumed from the head node:
from ray import tune
tuner = tune.Tuner.restore(
"/tmp/mypath/my-tune-exp",
+trainable=my_trainable,
resume_errored=True
)
tuner.fit()
25 changes: 2 additions & 23 deletions doc/source/tune/tutorials/tune_get_data_in_and_out.md
@@ -9,7 +9,6 @@ Often, you will find yourself needing to pass data into Tune [Trainables](tune_6

Let's start by defining a simple Trainable function. We'll be expanding this function with different functionality as we go.


```python
import random
import time
@@ -36,7 +35,6 @@ Our `training_function` function requires a pandas DataFrame, a model with some

We will run hyperparameter optimization using the [Tuner API](tune-run-ref).


```python
from ray.tune import Tuner
from ray import tune
@@ -78,7 +76,6 @@ Instead, use strings or other identifiers as your values, and initialize/load th

In our example, we want to tune the two model hyperparameters. We also want to set the number of epochs, so that we can easily tweak it later. For the hyperparameters, we will use the `tune.uniform` distribution. We will also modify the `training_function` to obtain those values from the `config` dictionary.


```python
def training_function(config):
# For now, we have nothing here.
@@ -126,7 +123,6 @@ Note that the serialization (once) and deserialization (for each Trial) of large

In our example, we will pass the `data` DataFrame using `tune.with_parameters`. In order to do that, we need to modify our function signature to include `data` as an argument.


```python
def training_function(config, data):
model = {
@@ -157,7 +153,6 @@ tuner = Tuner(

Next step is to wrap the `training_function` using `tune.with_parameters` before passing it into the `Tuner`. Every keyword argument of the `tune.with_parameters` call will be mapped to the keyword arguments in the Trainable signature.


```python
data = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})

@@ -184,15 +179,12 @@ A common use-case is to load the dataset from S3 or any other cloud storage with

The working directory of the Trainable worker will be automatically changed to the corresponding Trial directory. For more details, see {ref}`tune-working-dir`.


Our tuning run can now be run, though we will not yet obtain any meaningful outputs back.


```python
results = tuner.fit()
```


## Getting data out of Ray Tune

We can now run our tuning run using the `training_function` Trainable. The next step is to report *metrics* to Tune that can be used to guide the optimization. We will also want to *checkpoint* our trained models so that we can resume the training after an interruption, and to use them for prediction later.
@@ -213,7 +205,6 @@ Tune will automatically include some metrics, such as the training iteration, ti

In our example, we want to maximize the `metric`. We will report it each epoch to Tune, and set the `metric` and `mode` arguments in `tune.TuneConfig` to let Tune know that it should use it as the optimization objective.


```python
from ray.air import session

@@ -255,7 +246,6 @@ Callbacks are passed in the `callback` argument of the `Tuner`'s `RunConfig`.

In our example, we'll use the MLFlow callback to track the progress of our tuning run and the changing value of the `metric` (requires `mlflow` to be installed).


```python
from ray.air import RunConfig
from ray.air.integrations.mlflow import MLflowLoggerCallback
@@ -309,7 +299,6 @@ The experiment state itself is checkpointed separately. See {ref}`tune-persisted

In our example, we want to be able to resume the training from the latest checkpoint, and to save the `trained_model` in a checkpoint every iteration. To accomplish this, we will use the `session` and `Checkpoint` APIs.


```python
from ray.air import Checkpoint

@@ -358,19 +347,12 @@ tuner = Tuner(

With all of those changes implemented, we can now run our tuning and obtain meaningful metrics and artifacts.


```python
results = tuner.fit()
results.get_dataframe()
```



2022-11-30 17:40:28,839 INFO tune.py:762 -- Total run time: 15.79 seconds (15.65 seconds for the tuning loop).

<div>
<style scoped>
@@ -515,13 +497,10 @@ results.get_dataframe()
<p>4 rows × 23 columns</p>
</div>



Checkpoints, metrics, and the log directory for each trial can be accessed through the `ResultGrid` output of a Tune experiment. For more information on how to interact with the returned `ResultGrid`, see {doc}`/tune/examples/tune_analyze_results`.


### How do I access Tune results after I am finished?

After you have finished running the Python session, you can still access the results and checkpoints. By default, Tune will save the experiment results to the `~/ray_results` local directory. You can configure Tune to persist results in the cloud as well. See {ref}`tune-storage-options` for more information on how to configure storage options for persisting experiment results.

-You can restore the Tune experiment by calling `Tuner.restore(path_or_cloud_uri)`, where `path_or_cloud_uri` points to a location either on the filesystem or cloud where the experiment was saved to. After the `Tuner` has been restored, you can access the results and checkpoints by calling `Tuner.get_results()` to receive the `ResultGrid` object, and then proceeding as outlined in the previous section.
+You can restore the Tune experiment by calling {meth}`Tuner.restore(path_or_cloud_uri, trainable) <ray.tune.Tuner.restore>`, where `path_or_cloud_uri` points to a location either on the filesystem or cloud where the experiment was saved to. After the `Tuner` has been restored, you can access the results and checkpoints by calling `Tuner.get_results()` to receive the `ResultGrid` object, and then proceeding as outlined in the previous section.
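A minimal sketch of that restore-and-inspect flow, assuming the `training_function` defined earlier in this document and an illustrative experiment directory (the path below is a placeholder, not from the commit):

```python
from ray import tune

# Restore a previously run experiment from where Tune saved it, re-specifying
# the trainable it was originally built with (here, the doc's training_function).
tuner = tune.Tuner.restore(
    "~/ray_results/my_experiment",  # illustrative experiment path or cloud URI
    trainable=training_function,
)

# Inspect results and checkpoints without re-running any trials.
result_grid = tuner.get_results()
best_result = result_grid.get_best_result(metric="metric", mode="max")
print(best_result.metrics)
print(best_result.checkpoint)
```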
3 changes: 1 addition & 2 deletions python/ray/train/tests/test_tune.py
@@ -261,12 +261,11 @@ def train_func(config):
with pytest.warns() as warn_record:
tuner = Tuner.restore(
str(tmpdir / "restore_new_trainer"),
-overwrite_trainable=trainer,
+trainable=trainer,
resume_errored=True,
)
# Should warn about the RunConfig being ignored
assert any("RunConfig" in str(record.message) for record in warn_record)
assert "The trainable will be overwritten" in caplog.text

results = tuner.fit()
assert not results.errors
13 changes: 3 additions & 10 deletions python/ray/tune/impl/tuner_internal.py
@@ -232,6 +232,7 @@ def _validate_overwrite_trainable(
(ensuring same type and name as the original trainable).
"""

+# TODO(ml-team): Remove (https://github.com/ray-project/ray/issues/33546)
# Check if the trainable was wrapped with `tune.with_parameters`,
# Set the Tuner to fail on fit if the trainable is not re-specified.
trainable_wrapped_params = getattr(
@@ -249,8 +250,8 @@
"trainable_with_params = tune.with_parameters(trainable, ...)\n"
"tuner = tune.Tuner.restore(\n"
" ..., trainable=trainable_with_params\n"
")\n\nSee https://docs.ray.io/en/master/tune/api_docs/trainable.html"
"#tune-with-parameters for more details."
")\n\nSee https://docs.ray.io/en/latest/tune/api/doc/"
"ray.tune.with_parameters.html for more details."
)
if not overwrite_trainable:
return
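The error message above tells users how to restore an experiment whose trainable was wrapped with `tune.with_parameters`. A minimal sketch of that pattern, assuming a toy trainable, DataFrame, `factor` key, and experiment path that are all illustrative rather than taken from the commit:

```python
import pandas as pd

from ray import tune
from ray.air import session


def trainable(config, data: pd.DataFrame):
    # Stand-in for the original trainable that was wrapped with tune.with_parameters.
    session.report({"metric": len(data) * config["factor"]})


data = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})

# Re-wrap the trainable with the same parameters, then pass the wrapped
# trainable explicitly when restoring.
trainable_with_params = tune.with_parameters(trainable, data=data)
tuner = tune.Tuner.restore(
    "~/ray_results/my_experiment",  # illustrative experiment path
    trainable=trainable_with_params,
    resume_errored=True,
)
results = tuner.fit()
```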
@@ -286,14 +287,6 @@ def _validate_overwrite_trainable(
f"{overwrite_name} but expected {original_name}."
)

-logger.warning(
-"The trainable will be overwritten - this should be done with caution: "
-"it's possible to supply an incompatible trainable, and there are "
-"no guarantees that the resumed experiment will continue successfully. "
-"If you encounter errors during training, ensure that you are passing "
-"in the same trainable that was passed into the initial `Tuner` object."
-)

def _restore_from_path_or_uri(
self,
path_or_uri: str,
6 changes: 3 additions & 3 deletions python/ray/tune/tests/test_result_grid.py
@@ -392,7 +392,7 @@ def train_func(config):
result_grid = tuner.fit()

assert result_grid[0].checkpoint
-for (checkpoint, metric) in result_grid[0].best_checkpoints:
+for checkpoint, metric in result_grid[0].best_checkpoints:
assert checkpoint
assert len(result_grid[0].best_checkpoints) == num_to_keep

@@ -404,12 +404,12 @@ def train_func(config):
)

result_grid = tune.Tuner.restore(
str(tmpdir / "moved_ray_results" / "new_exp_dir")
str(tmpdir / "moved_ray_results" / "new_exp_dir"), trainable=train_func
).get_results()
checkpoint_data = []

assert len(result_grid[0].best_checkpoints) == num_to_keep
-for (checkpoint, _) in result_grid[0].best_checkpoints:
+for checkpoint, _ in result_grid[0].best_checkpoints:
assert checkpoint
assert "moved_ray_results" in checkpoint._local_path
assert checkpoint._local_path.startswith(result_grid._local_path)
4 changes: 2 additions & 2 deletions python/ray/tune/tests/test_syncer.py
@@ -912,7 +912,7 @@ def train_func(config):
# Check the contents of the upload_dir immediately after the experiment
# This won't be up to date if we don't wait on the last sync
download_from_uri("memory:///test_upload_dir/exp_name", tmpdir)
-cloud_results = tune.Tuner.restore(str(tmpdir)).get_results()
+cloud_results = tune.Tuner.restore(str(tmpdir), trainable=train_func).get_results()
last_reported_iter = cloud_results[0].metrics.get("training_iteration", None)
assert last_reported_iter == 8, (
"Experiment did not wait to finish the final experiment sync before exiting. "
@@ -986,7 +986,7 @@ def train_fn(config):

shutil.rmtree(local_dir) # Rely on sync-down from cloud
tuner = tune.Tuner.restore(
-str(URI(mock_s3_bucket_uri) / exp_name), resume_errored=True
+str(URI(mock_s3_bucket_uri) / exp_name), trainable=train_fn, resume_errored=True
)
result_grid = tuner.fit()

2 changes: 1 addition & 1 deletion python/ray/tune/tests/test_tuner.py
@@ -213,7 +213,7 @@ def on_step_end(self, iteration, trials, **kwargs):

# Test resume
restore_path = os.path.join(DEFAULT_RESULTS_DIR, "test_tuner_driver_fail")
-tuner = Tuner.restore(restore_path)
+tuner = Tuner.restore(restore_path, trainable=trainer)
# A hack before we figure out RunConfig semantics across resumes.
tuner._local_tuner._run_config.callbacks = None
results = tuner.fit()
