Commit
[Datasets] Remove "Example: Large-scale ML Ingest" (ray-project#33067)
Co-authored-by: angelinalg <122562471+angelinalg@users.noreply.github.com>
Co-authored-by: Clark Zinzow <clarkzinzow@gmail.com>
3 people authored and peytondmurray committed Mar 22, 2023
1 parent 79fe1da commit ab9f2e9
Showing 7 changed files with 1 addition and 483 deletions.
9 changes: 0 additions & 9 deletions doc/BUILD
@@ -34,15 +34,6 @@ py_test(
tags = ["exclusive", "team:ml"]
)

py_test(
    name = "big_data_ingestion",
    size = "small",
    main = "test_myst_doc.py",
    srcs = ["test_myst_doc.py"],
    args = ["--path", "doc/source/data/examples/big_data_ingestion.ipynb"],
    data = ["//doc/source/data/examples:data_examples"],
    tags = ["exclusive", "team:core", "py37"]
)

py_test(
name = "datasets_train",
2 changes: 0 additions & 2 deletions doc/source/_toc.yml
@@ -94,8 +94,6 @@ parts:
title: Processing the NYC taxi dataset
- file: data/examples/batch_training
title: Batch Training with Ray Datasets
- file: data/examples/big_data_ingestion
title: Large-scale ML Ingest
- file: data/examples/ocr_example
title: Scaling OCR with Ray Datasets
- file: data/examples/advanced-pipelines
54 changes: 0 additions & 54 deletions doc/source/data/big_data_ingestion.yaml

This file was deleted.

2 changes: 1 addition & 1 deletion doc/source/data/dataset.rst
@@ -34,7 +34,7 @@ Data Loading and Preprocessing for ML Training
----------------------------------------------

Use Ray Datasets to load and preprocess data for distributed :ref:`ML training pipelines <train-docs>`.
- Compared to other loading solutions, Datasets are more flexible (e.g., can express higher-quality `per-epoch global shuffles <examples/big_data_ingestion.html>`__) and provides `higher overall performance <https://www.anyscale.com/blog/why-third-generation-ml-platforms-are-more-performant>`__.
+ Compared to other loading solutions, Datasets are more flexible (e.g., can express higher-quality per-epoch global shuffles) and provides `higher overall performance <https://www.anyscale.com/blog/why-third-generation-ml-platforms-are-more-performant>`__.

Use Datasets as a last-mile bridge from storage or ETL pipeline outputs to distributed
applications and libraries in Ray. Don't use it as a replacement for more general data
17 changes: 0 additions & 17 deletions doc/source/data/examples/BUILD
@@ -5,20 +5,3 @@ filegroup(
    srcs = glob(["*.ipynb"]),
    visibility = ["//doc:__subpackages__"]
)

# --------------------------------------------------------------------
# Test all doc/source/data/examples notebooks.
# --------------------------------------------------------------------

# big_data_ingestion.ipynb is not tested right now due to large resource requirements
# and a need of a general overhaul.

py_test_run_all_notebooks(
    size = "large",
    include = ["*.ipynb"],
    exclude = [
        "big_data_ingestion.ipynb",
    ],
    data = ["//doc/source/data/examples:data_examples"],
    tags = ["exclusive", "team:ml"]
)

