ray-project · amogkam · Feb 8, 2023 · Jan 26, 2023 · Jan 30, 2023 · Jan 31, 2023
diff --git a/doc/source/data/api/data_representations.rst b/doc/source/data/api/data_representations.rst
@@ -23,7 +23,7 @@ Batch API
 
 .. autosummary::
    :toctree: doc/
-   
+
    block.DataBatch
 
 Row API
@@ -42,6 +42,7 @@ Tensor Column Extension API
 .. autosummary::
    :toctree: doc/
 
+   extensions.tensor_extension.create_ragged_ndarray
    extensions.tensor_extension.TensorDtype
    extensions.tensor_extension.TensorArray
    extensions.tensor_extension.ArrowTensorType

@@ -89,11 +89,21 @@ PyTorch
 ``TorchPredictor``
 ******************
 
-.. automodule:: ray.train.torch
+.. autoclass:: ray.train.torch.TorchPredictor
     :members:
-    :exclude-members: TorchTrainer
     :show-inheritance:
 
+    .. automethod:: __init__
+
+``TorchDetectionPredictor``
+***************************
+
+.. autoclass:: ray.train.torch.TorchDetectionPredictor
+    :members:
+    :show-inheritance:
+
+    .. automethod:: __init__
+
 Horovod
 ~~~~~~~
 

diff --git a/python/ray/air/tests/test_tensor_extension.py b/python/ray/air/tests/test_tensor_extension.py
@@ -13,8 +13,22 @@
     ArrowVariableShapedTensorType,
 )
 from ray.air.util.tensor_extensions.pandas import TensorArray, TensorDtype
+from ray.air.util.tensor_extensions.utils import create_ragged_ndarray
 from ray._private.utils import _get_pyarrow_version
-from ray.air.util.tensor_extensions.utils import _create_strict_ragged_ndarray
+
+
+@pytest.mark.parametrize(
+    "values", [[np.zeros((3, 1)), np.zeros((3, 2))], [np.zeros((3,))]]
+)
+def test_create_ragged_ndarray(values):
+    """Check that the ragged array keeps every input array, in order."""
+    ragged = create_ragged_ndarray(values)
+
+    # One outer element per input array.
+    assert len(ragged) == len(values)
+    # Each element holds the same values as the corresponding input.
+    for index, expected in enumerate(values):
+        np.testing.assert_array_equal(ragged[index], expected)
 
 
 def test_tensor_array_validation():
@@ -582,7 +596,7 @@ def test_arrow_tensor_array_slice(test_arr, dtype):
     for shape in pytest_tensor_array_concat_shapes
 ]
 pytest_tensor_array_concat_arrs += [
-    _create_strict_ragged_ndarray(
+    create_ragged_ndarray(
         [np.arange(4).reshape((2, 2)), np.arange(4, 13).reshape((3, 3))]
     )
 ]

diff --git a/python/ray/air/util/tensor_extensions/arrow.py b/python/ray/air/util/tensor_extensions/arrow.py
@@ -8,7 +8,7 @@
 
 from ray.air.util.tensor_extensions.utils import (
     _is_ndarray_variable_shaped_tensor,
-    _create_strict_ragged_ndarray,
+    create_ragged_ndarray,
 )
 from ray._private.utils import _get_pyarrow_version
 from ray.util.annotations import PublicAPI
@@ -783,7 +783,7 @@ def _to_numpy(self, index: Optional[int] = None, zero_copy_only: bool = False):
             arrs = [self._to_numpy(i, zero_copy_only) for i in range(len(self))]
             # Return ragged NumPy ndarray in the ndarray of ndarray pointers
             # representation.
-            return _create_strict_ragged_ndarray(arrs)
+            return create_ragged_ndarray(arrs)
         data = self.storage.field("data")
         shapes = self.storage.field("shape")
 

diff --git a/python/ray/air/util/tensor_extensions/utils.py b/python/ray/air/util/tensor_extensions/utils.py
@@ -3,6 +3,8 @@
 
 import numpy as np
 
+from ray.util import PublicAPI
+
 if TYPE_CHECKING:
     from pandas.core.dtypes.generic import ABCSeries
 
@@ -60,25 +62,55 @@ def _create_possibly_ragged_ndarray(
             or "The requested array has an inhomogeneous shape" in error_str
         ):
             # Fall back to strictly creating a ragged ndarray.
-            return _create_strict_ragged_ndarray(values)
+            return create_ragged_ndarray(values)
         else:
             # Re-raise original error if the failure wasn't a broadcast error.
             raise e from None
 
 
-def _create_strict_ragged_ndarray(values: Any) -> np.ndarray:
-    """Create a ragged ndarray; the representation will be ragged (1D array of
-    subndarray pointers) even if it's possible to represent it as a non-ragged ndarray.
-    """
-    # Use the create-empty-and-fill method. This avoids the following pitfalls of the
-    # np.array constructor - np.array(values, dtype=object):
-    #  1. It will fail to construct an ndarray if the first element dimension is
-    #  uniform, e.g. for imagery whose first element dimension is the channel.
-    #  2. It will construct the wrong representation for a single-row column (i.e. unit
-    #  outer dimension). Namely, it will consolidate it into a single multi-dimensional
-    #  ndarray rather than a 1D array of subndarray pointers, resulting in the single
-    #  row not being well-typed (having object dtype).
+@PublicAPI(stability="alpha")
+def create_ragged_ndarray(values: Sequence[np.ndarray]) -> np.ndarray:
+    """Create an array that contains arrays of different lengths.
+
+    If you're working with variable-length arrays like images, use this function to
+    create ragged arrays instead of ``np.array``.
+
+    .. note::
+        ``np.array`` fails to construct ragged arrays if the input arrays have a uniform
+        first dimension:
+
+        .. testsetup::
+
+            import numpy as np
+            from ray.air.util.tensor_extensions.utils import create_ragged_ndarray
+
+        .. doctest::
+
+            >>> values = [np.zeros((3, 1)), np.zeros((3, 2))]
+            >>> np.array(values, dtype=object)
+            Traceback (most recent call last):
+                ...
+            ValueError: could not broadcast input array from shape (3,1) into shape (3,)
+            >>> create_ragged_ndarray(values)
+            array([array([[0.],
+                          [0.],
+                          [0.]]), array([[0., 0.],
+                                         [0., 0.],
+                                         [0., 0.]])], dtype=object)
+
+        ``np.array`` also yields object-dtyped rows if the input holds a single array:
+
+        .. doctest::
+
+            >>> values = [np.zeros((3, 1))]
+            >>> np.array(values, dtype=object)[0].dtype
+            dtype('O')
+            >>> create_ragged_ndarray(values)[0].dtype
+            dtype('float64')
 
+        ``create_ragged_ndarray`` avoids the limitations of ``np.array`` by creating an
+        empty array and filling it with pointers to the variable-length arrays.
+    """  # noqa: E501
     # Create an empty object-dtyped 1D array.
     arr = np.empty(len(values), dtype=object)
     # Try to fill the 1D array of pointers with the (ragged) tensors.

diff --git a/python/ray/data/extensions/tensor_extension.py b/python/ray/data/extensions/tensor_extension.py
@@ -10,3 +10,4 @@
     ArrowVariableShapedTensorType,
     ArrowVariableShapedTensorArray,
 )
+from ray.air.util.tensor_extensions.utils import create_ragged_ndarray  # noqa: F401
diff --git a/python/ray/train/_internal/dl_predictor.py b/python/ray/train/_internal/dl_predictor.py
@@ -1,13 +1,13 @@
 import abc
-from typing import Dict, TypeVar, Union
+from typing import Dict, Optional, TypeVar, Union
 
 import numpy as np
 import pandas as pd
 
 from ray.air.util.data_batch_conversion import (
     BatchFormat,
-    convert_pandas_to_batch_type,
     convert_batch_type_to_pandas,
+    convert_pandas_to_batch_type,
 )
 from ray.train.predictor import Predictor
 from ray.util.annotations import DeveloperAPI
@@ -21,7 +21,7 @@ class DLPredictor(Predictor):
     def _arrays_to_tensors(
         self,
         numpy_arrays: Union[np.ndarray, Dict[str, np.ndarray]],
-        dtype: Union[TensorDtype, Dict[str, TensorDtype]],
+        dtype: Optional[Union[TensorDtype, Dict[str, TensorDtype]]],
     ) -> Union[TensorType, Dict[str, TensorType]]:
         """Converts a NumPy ndarray batch to the tensor type for the DL framework.
 
@@ -72,7 +72,9 @@ def preferred_batch_format(cls) -> BatchFormat:
         return BatchFormat.NUMPY
 
     def _predict_pandas(
-        self, data: pd.DataFrame, dtype: Union[TensorDtype, Dict[str, TensorDtype]]
+        self,
+        data: pd.DataFrame,
+        dtype: Optional[Union[TensorDtype, Dict[str, TensorDtype]]],
     ) -> pd.DataFrame:
         numpy_input = convert_pandas_to_batch_type(
             data,
@@ -85,7 +87,7 @@ def _predict_pandas(
     def _predict_numpy(
         self,
         data: Union[np.ndarray, Dict[str, np.ndarray]],
-        dtype: Union[TensorDtype, Dict[str, TensorDtype]],
+        dtype: Optional[Union[TensorDtype, Dict[str, TensorDtype]]],
     ) -> Union[np.ndarray, Dict[str, np.ndarray]]:
         # Single column selection return numpy array so preprocessors can be
         # reused in both training and prediction

diff --git a/python/ray/train/tensorflow/tensorflow_predictor.py b/python/ray/train/tensorflow/tensorflow_predictor.py
@@ -225,9 +225,9 @@ def predict(
     def _arrays_to_tensors(
         self,
         numpy_arrays: Union[np.ndarray, Dict[str, np.ndarray]],
-        dtypes: Union[tf.dtypes.DType, Dict[str, tf.dtypes.DType]],
+        dtype: Optional[Union[tf.dtypes.DType, Dict[str, tf.dtypes.DType]]],
     ) -> Union[tf.Tensor, Dict[str, tf.Tensor]]:
-        return convert_ndarray_batch_to_tf_tensor_batch(numpy_arrays, dtypes=dtypes)
+        return convert_ndarray_batch_to_tf_tensor_batch(numpy_arrays, dtypes=dtype)
 
     def _tensor_to_array(self, tensor: tf.Tensor) -> np.ndarray:
         if not isinstance(tensor, tf.Tensor):

@@ -0,0 +1,59 @@
+import numpy as np
+import pytest
+from torchvision import models
+
+import ray
+from ray.air.util.tensor_extensions.utils import create_ragged_ndarray
+from ray.train.batch_predictor import BatchPredictor
+from ray.train.torch import TorchCheckpoint, TorchDetectionPredictor
+
+
+@pytest.fixture(name="predictor")
+def predictor_fixture():
+    """Yield a predictor built from torchvision's ``maskrcnn_resnet50_fpn``."""
+    yield TorchDetectionPredictor(model=models.detection.maskrcnn_resnet50_fpn())
+
+
+@pytest.mark.parametrize(
+    "data",
+    [
+        np.zeros((1, 3, 32, 32), dtype=np.float32),
+        {"image": np.zeros((1, 3, 32, 32), dtype=np.float32)},
+        create_ragged_ndarray(
+            [
+                np.zeros((3, 32, 32), dtype=np.float32),
+                np.zeros((3, 64, 64), dtype=np.float32),
+            ]
+        ),
+    ],
+)
+def test_predict(predictor, data):
+    predictions = predictor.predict(data)
+    # A dict batch's `len` is its column count, so read the size from its column.
+    batch_size = len(data["image"]) if isinstance(data, dict) else len(data)
+    assert all(len(value) == batch_size for value in predictions.values())
+    # Boxes should have shape `(# detections, 4)`.
+    assert all(boxes.ndim == 2 for boxes in predictions["pred_boxes"])
+    assert all(boxes.shape[-1] == 4 for boxes in predictions["pred_boxes"])
+    # Labels and scores should each have shape `(# detections,)`.
+    assert all(labels.ndim == 1 for labels in predictions["pred_labels"])
+    assert all(scores.ndim == 1 for scores in predictions["pred_scores"])
+
+
+def test_multi_column_batch_raises_value_error(predictor):
+    # A batch should only contain one key. Otherwise, `TorchDetectionPredictor`
+    # doesn't know which column contains the input images.
+    multi_column_batch = {
+        "image": np.zeros((2, 3, 32, 32), dtype=np.float32),
+        "boxes": np.zeros((2, 0, 4), dtype=np.float32),
+        "labels": np.zeros((2, 0), dtype=np.int64),
+    }
+    with pytest.raises(ValueError):
+        predictor.predict(multi_column_batch)
+
+
+def test_invalid_dtype_raises_value_error(predictor):
+    # `dtype` should be a single `torch.dtype`, so passing a NumPy dtype must raise.
+    batch = np.zeros((1, 3, 32, 32), dtype=np.float32)
+    with pytest.raises(ValueError):
+        predictor.predict(batch, dtype=np.float32)
@@ -7,8 +7,9 @@
     )
 # isort: on
 
-from ray.train.torch.torch_checkpoint import TorchCheckpoint
 from ray.train.torch.config import TorchConfig
+from ray.train.torch.torch_checkpoint import TorchCheckpoint
+from ray.train.torch.torch_detection_predictor import TorchDetectionPredictor
 from ray.train.torch.torch_predictor import TorchPredictor
 from ray.train.torch.torch_trainer import TorchTrainer
 from ray.train.torch.train_loop_utils import (
@@ -33,4 +34,5 @@
     "backward",
     "enable_reproducibility",
     "TorchPredictor",
+    "TorchDetectionPredictor",
 ]