Adds raycasting-based camera implementation (#159)
This PR adds the `RayCasterCamera`, a ray-cast-based camera for the
"distance_to_camera", "distance_to_image_plane", and "normals"
annotations. It has the same interface and functionality as the USD
Camera wrapper while being, on average, 30% faster.

Fixes #131
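For context, a minimal usage sketch (hypothetical, not taken from this diff; the `update` call and the output layout are assumed from the existing sensor interface):

```python
# Sketch: read one of the supported annotations from the new sensor.
# Assumes `camera` is an initialized RayCasterCamera from
# omni.isaac.orbit.sensors, configured with "distance_to_image_plane",
# and that the simulation is already playing.
sim_dt = 0.01  # physics step used by the simulation
camera.update(dt=sim_dt)
depth = camera.data.output["distance_to_image_plane"]  # assumed shape (N, H, W)
```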

- Bug fix (non-breaking change which fixes an issue)
- New feature (non-breaking change which adds functionality)
- This change requires a documentation update

- [x] I have run the [`pre-commit` checks](https://pre-commit.com/) with
`./orbit.sh --format`
- [ ] I have made corresponding changes to the documentation
- [x] My changes generate no new warnings
- [x] I have added tests that prove my fix is effective or that my
feature works
- [x] I have updated the changelog and the corresponding version in the
extension's `config/extension.toml` file

---------

Signed-off-by: Pascal Roth <57946385+pascal-roth@users.noreply.github.com>
Co-authored-by: Mayank Mittal <mittalma@leggedrobotics.com>
pascal-roth and Mayankm96 committed Nov 1, 2023
1 parent e7d43de commit 5c3fa70
Showing 20 changed files with 1,567 additions and 234 deletions.
2 changes: 1 addition & 1 deletion source/extensions/omni.isaac.orbit/config/extension.toml
@@ -1,7 +1,7 @@
[package]

# Note: Semantic Versioning is used: https://semver.org/
version = "0.9.30"
version = "0.9.31"

# Description
title = "ORBIT framework for Robot Learning"
12 changes: 11 additions & 1 deletion source/extensions/omni.isaac.orbit/docs/CHANGELOG.rst
@@ -1,6 +1,17 @@
Changelog
---------

0.9.31 (2023-11-02)
~~~~~~~~~~~~~~~~~~~

Added
^^^^^

* Added the :class:`omni.isaac.orbit.sensors.RayCasterCamera` class, a ray-casting-based camera for the
"distance_to_camera", "distance_to_image_plane" and "normals" annotations. It has the same interface and
functionality as the USD Camera, while being on average 30% faster.


0.9.30 (2023-11-01)
~~~~~~~~~~~~~~~~~~~

@@ -155,7 +166,6 @@ Added


0.9.18 (2023-10-23)
~~~~~~~~~~~~~~~~~~~

Added
^^^^^
@@ -19,7 +19,7 @@
radius=0.02,
visual_material=sim_utils.PreviewSurfaceCfg(diffuse_color=(1.0, 0.0, 0.0)),
),
}
},
)
"""Configuration for the ray-caster marker."""

@@ -13,12 +13,10 @@
from typing_extensions import Literal

import omni.isaac.core.utils.prims as prim_utils
import omni.isaac.core.utils.stage as stage_utils
import omni.kit.commands
import omni.usd
from omni.isaac.core.prims import XFormPrimView
from omni.isaac.core.utils.rotations import gf_quat_to_np_array
from pxr import Gf, Usd, UsdGeom
from pxr import Usd, UsdGeom

# omni-isaac-orbit
from omni.isaac.orbit.utils import to_camel_case
@@ -27,7 +25,7 @@

from ..sensor_base import SensorBase
from .camera_data import CameraData
from .utils import convert_orientation_convention
from .utils import convert_orientation_convention, create_rotation_matrix_from_view

if TYPE_CHECKING:
from .camera_cfg import CameraCfg
@@ -66,6 +64,8 @@ class Camera(SensorBase):

cfg: CameraCfg
"""The configuration parameters."""
UNSUPPORTED_TYPES: set[str] = {"bounding_box_2d_tight", "bounding_box_2d_loose", "bounding_box_3d"}
"""The set of sensor types that are not supported by the camera class."""

def __init__(self, cfg: CameraCfg):
"""Initializes the camera sensor.
@@ -101,8 +101,7 @@ def __init__(self, cfg: CameraCfg):
self._data = CameraData()
# check if there is any intersection in unsupported types
# reason: these use np structured data types which we can't yet convert to torch tensor
unsupported_types = {"bounding_box_2d_tight", "bounding_box_2d_loose", "bounding_box_3d"}
common_elements = set(self.cfg.data_types) & unsupported_types
common_elements = set(self.cfg.data_types) & Camera.UNSUPPORTED_TYPES
if common_elements:
raise ValueError(
f"Camera class does not support the following sensor types: {common_elements}."
@@ -167,7 +166,7 @@ def image_shape(self) -> tuple[int, int]:
"""

def set_intrinsic_matrices(
self, matrices: torch.Tensor, focal_length: float = 1.0, indices: Sequence[int] | None = None
self, matrices: torch.Tensor, focal_length: float = 1.0, env_ids: Sequence[int] | None = None
):
"""Set parameters of the USD camera from its intrinsic matrix.
@@ -186,20 +185,15 @@
is not true in the input intrinsic matrix, then the camera will not set up correctly.
Args:
matrices: The intrinsic matrices for the camera. Shape is :math:`(N, 3, 3)`.
matrices: The intrinsic matrices for the camera. Shape is (N, 3, 3).
focal_length: Focal length to use when computing aperture values. Defaults to 1.0.
indices: A list of indices of length :obj:`N` to specify the prims to manipulate.
Defaults to None, which means all prims will be manipulated.
env_ids: The sensor ids to manipulate. Defaults to None, which means all sensor indices.
"""
# resolve indices
# check camera prim exists
if not self._is_initialized:
raise RuntimeError("Camera is not initialized. Please call 'sim.play()' first.")
# resolve indices
if indices is None:
indices = self._ALL_INDICES
# iterate over indices
for i, matrix in zip(indices, matrices):
# resolve env_ids
if env_ids is None:
env_ids = self._ALL_INDICES
# iterate over env_ids
for i, matrix in zip(env_ids, matrices):
# convert to numpy for sanity
intrinsic_matrix = np.asarray(matrix, dtype=float)
# extract parameters from matrix
@@ -240,40 +234,35 @@ def set_world_poses(
self,
positions: torch.Tensor | None = None,
orientations: torch.Tensor | None = None,
indices: Sequence[int] | None = None,
env_ids: Sequence[int] | None = None,
convention: Literal["opengl", "ros", "world"] = "ros",
):
r"""Set the pose of the camera w.r.t. the world frame using specified convention.
Since different fields use different conventions for camera orientations, the method allows users to
set the camera poses in the specified convention. Possible conventions are:
- :obj:"opengl" - forward axis: -Z - up axis +Y - Offset is applied in the OpenGL (Usd.Camera) convention
- :obj:"ros" - forward axis: +Z - up axis -Y - Offset is applied in the ROS convention
- :obj:"world" - forward axis: +X - up axis +Z - Offset is applied in the World Frame convention
- :obj:`"opengl"` - forward axis: -Z - up axis +Y - Offset is applied in the OpenGL (Usd.Camera) convention
- :obj:`"ros"` - forward axis: +Z - up axis -Y - Offset is applied in the ROS convention
- :obj:`"world"` - forward axis: +X - up axis +Z - Offset is applied in the World Frame convention
See :meth:`omni.isaac.orbit.sensors.camera.utils.convert_orientation_convention` for more details
on the conventions.
Args:
positions: The cartesian coordinates (in meters). Shape is :math:`(N, 3)`.
positions: The cartesian coordinates (in meters). Shape is (N, 3).
Defaults to None, in which case the camera position is not changed.
orientations: The quaternion orientation in (w, x, y, z). Shape is :math:`(N, 4)`.
orientations: The quaternion orientation in (w, x, y, z). Shape is (N, 4).
Defaults to None, in which case the camera orientation is not changed.
indices: A list of indices of length :obj:`N` to specify the prims to manipulate.
Defaults to None, which means all prims will be manipulated.
convention: The convention in which the poses are fed.
Defaults to "ros".
env_ids: The sensor ids to manipulate. Defaults to None, which means all sensor indices.
convention: The convention in which the poses are fed. Defaults to "ros".
Raises:
RuntimeError: If the camera prim is not set. Need to call :meth:`initialize` method first.
"""
# check camera prim exists
if not self._is_initialized:
raise RuntimeError("Camera is not initialized. Please call 'sim.play()' first.")
# resolve indices
if indices is None:
indices = self._ALL_INDICES
# resolve env_ids
if env_ids is None:
env_ids = self._ALL_INDICES
# convert to backend tensor
if positions is not None:
if isinstance(positions, np.ndarray):
@@ -288,71 +277,28 @@
orientations = torch.tensor(orientations, device=self._device)
orientations = convert_orientation_convention(orientations, origin=convention, target="opengl")
# set the pose
self._view.set_world_poses(positions, orientations, indices)
self._view.set_world_poses(positions, orientations, env_ids)
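A sketch of the updated call (pose values illustrative; assumes `camera` wraps two sensors and that the tensors live on the camera's device):

```python
# Place both cameras 2 m above the ground, level and facing +X, using the
# "world" convention (forward +X, up +Z). Quaternions are (w, x, y, z).
import torch

positions = torch.tensor([[0.0, 0.0, 2.0],
                          [1.0, 0.0, 2.0]])
orientations = torch.tensor([[1.0, 0.0, 0.0, 0.0],   # identity: look along +X
                             [1.0, 0.0, 0.0, 0.0]])
camera.set_world_poses(positions, orientations, convention="world")
```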

def set_world_poses_from_view(
self, eyes: torch.Tensor, targets: torch.Tensor, indices: Sequence[int] | None = None
self, eyes: torch.Tensor, targets: torch.Tensor, env_ids: Sequence[int] | None = None
):
"""Set the poses of the camera from the eye position and look-at target position.
Args:
eyes: The positions of the camera's eye. Shape is :math:`(N, 3)`.
targets: The target locations to look at. Shape is :math:`(N, 3)`.
indices: A list of indices of length :math:`N` to specify the prims to manipulate.
Defaults to None, which means all prims will be manipulated.
eyes: The positions of the camera's eye. Shape is (N, 3).
targets: The target locations to look at. Shape is (N, 3).
env_ids: The sensor ids to manipulate. Defaults to None, which means all sensor indices.
Raises:
RuntimeError: If the camera prim is not set. Need to call :meth:`initialize` method first.
NotImplementedError: If the stage up-axis is not "Y" or "Z".
"""
# check camera prim exists
if not self._is_initialized:
raise RuntimeError("Camera is not initialized. Please 'sim.play()' first.")
# resolve indices
if indices is None:
indices = self._ALL_INDICES
# create tensors for storing poses
positions = torch.zeros((len(indices), 3), device=self._device)
orientations = torch.zeros((len(indices), 4), device=self._device)
# iterate over all indices
# TODO: Can we do this in parallel?
for i, eye, target in zip(indices, eyes, targets):
# compute camera's eye pose
eye_position = Gf.Vec3d(np.asarray(eye).tolist())
target_position = Gf.Vec3d(np.asarray(target).tolist())
# compute forward direction
forward_dir = (eye_position - target_position).GetNormalized()
# get up axis
up_axis_token = stage_utils.get_stage_up_axis()
if up_axis_token == UsdGeom.Tokens.y:
# deal with degenerate case
if forward_dir == Gf.Vec3d(0, 1, 0):
up_axis = Gf.Vec3d(0, 0, 1)
elif forward_dir == Gf.Vec3d(0, -1, 0):
up_axis = Gf.Vec3d(0, 0, -1)
else:
up_axis = Gf.Vec3d(0, 1, 0)
elif up_axis_token == UsdGeom.Tokens.z:
# deal with degenerate case
if forward_dir == Gf.Vec3d(0, 0, 1):
up_axis = Gf.Vec3d(0, 1, 0)
elif forward_dir == Gf.Vec3d(0, 0, -1):
up_axis = Gf.Vec3d(0, -1, 0)
else:
up_axis = Gf.Vec3d(0, 0, 1)
else:
raise NotImplementedError(f"This method is not supported for up-axis '{up_axis_token}'.")
# compute matrix transformation
# view matrix: camera_T_world
matrix_gf = Gf.Matrix4d(1).SetLookAt(eye_position, target_position, up_axis)
# camera position and rotation in world frame
matrix_gf = matrix_gf.GetInverse()
positions[i] = torch.from_numpy(np.asarray(matrix_gf.ExtractTranslation())).to(device=self._device)
orientations[i] = torch.from_numpy(gf_quat_to_np_array(matrix_gf.ExtractRotationQuat())).to(
device=self._device
)
# resolve env_ids
if env_ids is None:
env_ids = self._ALL_INDICES
# set camera poses using the view
self._view.set_world_poses(positions, orientations, indices)
orientations = quat_from_matrix(create_rotation_matrix_from_view(eyes, targets, device=self._device))
self._view.set_world_poses(eyes, orientations, env_ids)
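The batched helper replaces the per-camera USD `SetLookAt` loop deleted above. One plausible shape for such a helper — the actual `create_rotation_matrix_from_view` in `.utils` may differ, for instance in up-axis and degenerate-case handling:

```python
import torch
import torch.nn.functional as F

def look_at_rotation(eyes: torch.Tensor, targets: torch.Tensor) -> torch.Tensor:
    """Batched rotations, shape (N, 3, 3); columns are the camera basis axes
    expressed in the world frame. Assumes a Z-up stage and that no view
    direction is parallel to the up axis (degenerate case not handled)."""
    up = torch.tensor([0.0, 0.0, 1.0], device=eyes.device).expand_as(eyes)
    z_axis = F.normalize(targets - eyes, dim=-1)                   # view direction
    x_axis = F.normalize(torch.cross(up, z_axis, dim=-1), dim=-1)  # side axis
    y_axis = torch.cross(z_axis, x_axis, dim=-1)                   # recomputed up
    return torch.stack((x_axis, y_axis, z_axis), dim=-1)
```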

"""
Operations
@@ -401,7 +347,7 @@ def _initialize_impl(self):
f" the number of environments ({self._num_envs})."
)

# Create all indices buffer
# Create all env_ids buffer
self._ALL_INDICES = torch.arange(self._view.count, device=self._device, dtype=torch.long)
# Create frame count buffer
self._frame = torch.zeros(self._view.count, device=self._device, dtype=torch.long)
@@ -455,9 +401,6 @@ def _initialize_impl(self):
self._create_buffers()

def _update_buffers_impl(self, env_ids: Sequence[int]):
# check camera prim exists
if not self._is_initialized:
raise RuntimeError("Camera is not initialized. Please call 'sim.play()' first.")
# Increment frame count
self._frame[env_ids] += 1
# -- intrinsic matrix
@@ -493,9 +436,7 @@ def _create_buffers(self):
# create the data object
# -- pose of the cameras
self._data.pos_w = torch.zeros((self._view.count, 3), device=self._device)
self._data.quat_w_ros = torch.zeros((self._view.count, 4), device=self._device)
self._data.quat_w_world = torch.zeros_like(self._data.quat_w_ros)
self._data.quat_w_opengl = torch.zeros_like(self._data.quat_w_ros)
self._data.quat_w_world = torch.zeros((self._view.count, 4), device=self._device)
# -- intrinsic matrix
self._data.intrinsic_matrices = torch.zeros((self._view.count, 3, 3), device=self._device)
self._data.image_shape = self.image_shape
@@ -564,13 +505,9 @@ def _update_poses(self, env_ids: Sequence[int]):
# extract the position (convert it to SI units-- assumed that stage units is 1.0)
self._data.pos_w[env_ids] = torch.tensor(prim_tf_all[:, 0:3, 3], device=self._device, dtype=torch.float32)

# save opengl convention
# save quat in world convention
quat_opengl = quat_from_matrix(torch.tensor(prim_tf_all[:, 0:3, 0:3], device=self._device, dtype=torch.float32))
self._data.quat_w_opengl[env_ids] = quat_opengl

# save world and ros convention
self._data.quat_w_world[env_ids] = convert_orientation_convention(quat_opengl, origin="opengl", target="world")
self._data.quat_w_ros[env_ids] = convert_orientation_convention(quat_opengl, origin="opengl", target="ros")

def _create_annotator_data(self):
"""Create the buffers to store the annotator data.
@@ -10,6 +10,8 @@
from tensordict import TensorDict
from typing import Any

from .utils import convert_orientation_convention


@dataclass
class CameraData:
@@ -25,15 +27,6 @@ class CameraData:
Shape is (N, 3) where ``N`` is the number of sensors.
"""

quat_w_ros: torch.Tensor = None
"""Quaternion orientation `(w, x, y, z)` of the sensor origin in world frame, following ROS convention.
.. note::
ROS convention follows the camera aligned with forward axis +Z and up axis -Y.
Shape is (N, 4) where ``N`` is the number of sensors.
"""

quat_w_world: torch.Tensor = None
"""Quaternion orientation `(w, x, y, z)` of the sensor origin in world frame, following the world coordinate frame
@@ -43,15 +36,6 @@ class CameraData:
Shape is ``(N, 4)`` where ``N`` is the number of sensors.
"""

quat_w_opengl: torch.Tensor = None
"""Quaternion orientation `(w, x, y, z)` of the sensor origin in world frame, following Opengl / Usd.Camera convention
.. note::
OpenGL convention follows the camera aligned with forward axis -Z and up axis +Y.
Shape is ``(N, 4)`` where ``N`` is the number of sensors.
"""

##
# Camera data
##
@@ -81,3 +65,30 @@
For semantic-based data, this corresponds to the ``"info"`` key in the output of the sensor. For other sensor
types, the info is empty.
"""

##
# Additional Frame orientation conventions
##

@property
def quat_w_ros(self) -> torch.Tensor:
"""Quaternion orientation `(w, x, y, z)` of the sensor origin in the world frame, following ROS convention.
.. note::
ROS convention follows the camera aligned with forward axis +Z and up axis -Y.
Shape is (N, 4) where ``N`` is the number of sensors.
"""
return convert_orientation_convention(self.quat_w_world, origin="world", target="ros")

@property
def quat_w_opengl(self) -> torch.Tensor:
"""Quaternion orientation `(w, x, y, z)` of the sensor origin in the world frame, following
OpenGL / USD Camera convention.
.. note::
OpenGL convention follows the camera aligned with forward axis -Z and up axis +Y.
Shape is ``(N, 4)`` where ``N`` is the number of sensors.
"""
return convert_orientation_convention(self.quat_w_world, origin="world", target="opengl")