diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index e90992ca29..e59d3de8d4 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -50,6 +50,7 @@ Guidelines for modifications: * Jan Kerner * Jean Tampon * Jia Lin Yuan +* Jinghuan Shang * Jingzhou Liu * Johnson Sun * Kaixi Bao diff --git a/docs/licenses/dependencies/einops-license.txt b/docs/licenses/dependencies/einops-license.txt new file mode 100644 index 0000000000..3a654e9066 --- /dev/null +++ b/docs/licenses/dependencies/einops-license.txt @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2018 Alex Rogozhnikov + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/docs/licenses/dependencies/transformers-license.txt b/docs/licenses/dependencies/transformers-license.txt new file mode 100644 index 0000000000..68b7d66c97 --- /dev/null +++ b/docs/licenses/dependencies/transformers-license.txt @@ -0,0 +1,203 @@ +Copyright 2018- The Hugging Face team. All rights reserved. + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. 
+ + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. 
In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
diff --git a/docs/source/_static/refs.bib b/docs/source/_static/refs.bib index e7f82d17da..c87c90c076 100644 --- a/docs/source/_static/refs.bib +++ b/docs/source/_static/refs.bib @@ -139,3 +139,18 @@ @article{mittal2023orbit pages={3740-3747}, doi={10.1109/LRA.2023.3270034} } + +@article{shang2024theia, + title={Theia: Distilling diverse vision foundation models for robot learning}, + author={Shang, Jinghuan and Schmeckpeper, Karl and May, Brandon B and Minniti, Maria Vittoria and Kelestemur, Tarik and Watkins, David and Herlant, Laura}, + journal={arXiv preprint arXiv:2407.20179}, + year={2024} +} + +@inproceedings{he2016deep, + title={Deep residual learning for image recognition}, + author={He, Kaiming and Zhang, Xiangyu and Ren, Shaoqing and Sun, Jian}, + booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition}, + pages={770--778}, + year={2016} +} diff --git a/source/apps/isaaclab.python.headless.rendering.kit b/source/apps/isaaclab.python.headless.rendering.kit index 5d14c551dc..0e808021ee 100644 --- a/source/apps/isaaclab.python.headless.rendering.kit +++ b/source/apps/isaaclab.python.headless.rendering.kit @@ -35,6 +35,10 @@ app.folder = "${exe-path}/" app.name = "Isaac-Sim" app.version = "4.2.0" +# Disable print outs on extension startup information +# this only disables the app print_and_log function +app.enableStdoutOutput = false + # set the default ros bridge to disable on startup isaac.startup.ros_bridge_extension = "" diff --git a/source/apps/isaaclab.python.rendering.kit b/source/apps/isaaclab.python.rendering.kit index 539b109a19..abd272a5fe 100644 --- a/source/apps/isaaclab.python.rendering.kit +++ b/source/apps/isaaclab.python.rendering.kit @@ -35,6 +35,10 @@ app.folder = "${exe-path}/" app.name = "Isaac-Sim" app.version = "4.2.0" +# Disable print outs on extension startup information +# this only disables the app print_and_log function +app.enableStdoutOutput = false + # set the default ros bridge to disable on startup isaac.startup.ros_bridge_extension = "" diff --git a/source/extensions/omni.isaac.lab/omni/isaac/lab/envs/mdp/observations.py b/source/extensions/omni.isaac.lab/omni/isaac/lab/envs/mdp/observations.py index aca0f579ce..2c97cf151e 100644 --- a/source/extensions/omni.isaac.lab/omni/isaac/lab/envs/mdp/observations.py +++ b/source/extensions/omni.isaac.lab/omni/isaac/lab/envs/mdp/observations.py @@ -186,42 +186,46 @@ def body_incoming_wrench(env: ManagerBasedEnv, asset_cfg: SceneEntityCfg) -> tor def imu_orientation(env: ManagerBasedEnv, asset_cfg: SceneEntityCfg = SceneEntityCfg("imu")) -> torch.Tensor: - """Imu sensor orientation w.r.t the env.scene.origin. + """Imu sensor orientation in the simulation world frame. Args: env: The environment. - asset_cfg: The SceneEntity associated with an Imu sensor. + asset_cfg: The SceneEntity associated with an IMU sensor. Defaults to SceneEntityCfg("imu"). Returns: - Orientation quaternion (wxyz), shape of torch.tensor is (num_env,4). + Orientation in the world frame in (w, x, y, z) quaternion form. Shape is (num_envs, 4). """ + # extract the used quantities (to enable type-hinting) asset: Imu = env.scene[asset_cfg.name] + # return the orientation quaternion return asset.data.quat_w def imu_ang_vel(env: ManagerBasedEnv, asset_cfg: SceneEntityCfg = SceneEntityCfg("imu")) -> torch.Tensor: - """Imu sensor angular velocity w.r.t. env.scene.origin expressed in the sensor frame. + """Imu sensor angular velocity w.r.t. environment origin expressed in the sensor frame. 
Args: env: The environment. - asset_cfg: The SceneEntity associated with an Imu sensor. + asset_cfg: The SceneEntity associated with an IMU sensor. Defaults to SceneEntityCfg("imu"). Returns: - Angular velocity (rad/s), shape of torch.tensor is (num_env,3). + The angular velocity (rad/s) in the sensor frame. Shape is (num_envs, 3). """ + # extract the used quantities (to enable type-hinting) asset: Imu = env.scene[asset_cfg.name] + # return the angular velocity return asset.data.ang_vel_b def imu_lin_acc(env: ManagerBasedEnv, asset_cfg: SceneEntityCfg = SceneEntityCfg("imu")) -> torch.Tensor: - """Imu sensor linear acceleration w.r.t. env.scene.origin expressed in sensor frame. + """Imu sensor linear acceleration w.r.t. the environment origin expressed in sensor frame. Args: env: The environment. - asset_cfg: The SceneEntity associated with an Imu sensor. + asset_cfg: The SceneEntity associated with an IMU sensor. Defaults to SceneEntityCfg("imu"). Returns: - linear acceleration (m/s^2), shape of torch.tensor is (num_env,3). + The linear acceleration (m/s^2) in the sensor frame. Shape is (num_envs, 3). """ asset: Imu = env.scene[asset_cfg.name] return asset.data.lin_acc_b @@ -279,42 +283,56 @@ def image( class image_features(ManagerTermBase): """Extracted image features from a pre-trained frozen encoder. - This method calls the :meth:`image` function to retrieve images, and then performs - inference on those images. + This term uses models from the model zoo in PyTorch and extracts features from the images. + + It calls the :func:`image` function to get the images and then processes them using the model zoo. + + A user can provide their own model zoo configuration to use different models for feature extraction. + The model zoo configuration should be a dictionary that maps different model names to a dictionary + that defines the model, preprocess and inference functions. The dictionary should have the following + entries: + + - "model": A callable that returns the model when invoked without arguments. + - "reset": A callable that resets the model. This is useful when the model has a state that needs to be reset. + - "inference": A callable that, when given the model and the images, returns the extracted features. + + If the model zoo configuration is not provided, the default model zoo configurations are used. The default + model zoo configurations include the models from Theia :cite:`shang2024theia` and ResNet :cite:`he2016deep`. + These models are loaded from `Hugging-Face transformers `_ and + `PyTorch torchvision `_ respectively. + + Args: + sensor_cfg: The sensor configuration to poll. Defaults to SceneEntityCfg("tiled_camera"). + data_type: The sensor data type. Defaults to "rgb". + convert_perspective_to_orthogonal: Whether to orthogonalize perspective depth images. + This is used only when the data type is "distance_to_camera". Defaults to False. + model_zoo_cfg: A user-defined dictionary that maps different model names to their respective configurations. + Defaults to None. If None, the default model zoo configurations are used. + model_name: The name of the model to use for inference. Defaults to "resnet18". + model_device: The device to store and infer the model on. This is useful when offloading the computation + from the environment simulation device. Defaults to the environment device. + inference_kwargs: Additional keyword arguments to pass to the inference function. Defaults to None, + which means no additional arguments are passed. 
+ + Returns: + The extracted features tensor. Shape is (num_envs, feature_dim). + + Raises: + ValueError: When the model name is not found in the provided model zoo configuration. + ValueError: When the model name is not found in the default model zoo configuration. """ def __init__(self, cfg: ObservationTermCfg, env: ManagerBasedEnv): + # initialize the base class super().__init__(cfg, env) - from torchvision import models - from transformers import AutoModel - def create_theia_model(model_name): - return { - "model": ( - lambda: AutoModel.from_pretrained(f"theaiinstitute/{model_name}", trust_remote_code=True) - .eval() - .to("cuda:0") - ), - "preprocess": lambda img: (img - torch.amin(img, dim=(1, 2), keepdim=True)) / ( - torch.amax(img, dim=(1, 2), keepdim=True) - torch.amin(img, dim=(1, 2), keepdim=True) - ), - "inference": lambda model, images: model.forward_feature( - images, do_rescale=False, interpolate_pos_encoding=True - ), - } - - def create_resnet_model(resnet_name): - return { - "model": lambda: getattr(models, resnet_name)(pretrained=True).eval().to("cuda:0"), - "preprocess": lambda img: ( - img.permute(0, 3, 1, 2) # Convert [batch, height, width, 3] -> [batch, 3, height, width] - - torch.tensor([0.485, 0.456, 0.406], device=img.device).view(1, 3, 1, 1) - ) / torch.tensor([0.229, 0.224, 0.225], device=img.device).view(1, 3, 1, 1), - "inference": lambda model, images: model(images), - } + # extract parameters from the configuration + self.model_zoo_cfg: dict = cfg.params.get("model_zoo_cfg") # type: ignore + self.model_name: str = cfg.params.get("model_name", "resnet18") # type: ignore + self.model_device: str = cfg.params.get("model_device", env.device) # type: ignore - # List of Theia models - theia_models = [ + # List of Theia models - These are configured through `_prepare_theia_transformer_model` function + default_theia_models = [ "theia-tiny-patch16-224-cddsv", "theia-tiny-patch16-224-cdiv", "theia-small-patch16-224-cdiv", @@ -322,22 +340,43 @@ def create_resnet_model(resnet_name): "theia-small-patch16-224-cddsv", "theia-base-patch16-224-cddsv", ] - - # List of ResNet models - resnet_models = ["resnet18", "resnet34", "resnet50", "resnet101"] - - self.default_model_zoo_cfg = {} - - # Add Theia models to the zoo - for model_name in theia_models: - self.default_model_zoo_cfg[model_name] = create_theia_model(model_name) - - # Add ResNet models to the zoo - for resnet_name in resnet_models: - self.default_model_zoo_cfg[resnet_name] = create_resnet_model(resnet_name) - - self.model_zoo_cfg = self.default_model_zoo_cfg - self.model_zoo = {} + # List of ResNet models - These are configured through `_prepare_resnet_model` function + default_resnet_models = ["resnet18", "resnet34", "resnet50", "resnet101"] + + # Check if model name is specified in the model zoo configuration + if self.model_zoo_cfg is not None and self.model_name not in self.model_zoo_cfg: + raise ValueError( + f"Model name '{self.model_name}' not found in the provided model zoo configuration." + " Please add the model to the model zoo configuration or use a different model name." + f" Available models in the provided list: {list(self.model_zoo_cfg.keys())}." + "\nHint: If you want to use a default model, consider using one of the following models:" + f" {default_theia_models + default_resnet_models}. In this case, you can remove the" + " 'model_zoo_cfg' parameter from the observation term configuration." 
+ ) + if self.model_zoo_cfg is None: + if self.model_name in default_theia_models: + model_config = self._prepare_theia_transformer_model(self.model_name, self.model_device) + elif self.model_name in default_resnet_models: + model_config = self._prepare_resnet_model(self.model_name, self.model_device) + else: + raise ValueError( + f"Model name '{self.model_name}' not found in the default model zoo configuration." + f" Available models: {default_theia_models + default_resnet_models}." + ) + else: + model_config = self.model_zoo_cfg[self.model_name] + + # Retrieve the model, preprocess and inference functions + self._model = model_config["model"]() + self._reset_fn = model_config.get("reset") + self._inference_fn = model_config["inference"] + + def reset(self, env_ids: torch.Tensor | None = None): + # reset the model if a reset function is provided + # this might be useful when the model has a state that needs to be reset + # for example: video transformers + if self._reset_fn is not None: + self._reset_fn(self._model, env_ids) def __call__( self, @@ -346,62 +385,123 @@ def __call__( data_type: str = "rgb", convert_perspective_to_orthogonal: bool = False, model_zoo_cfg: dict | None = None, - model_name: str = "ResNet18", - model_device: str | None = "cuda:0", - reset_model: bool = False, + model_name: str = "resnet18", + model_device: str | None = None, + inference_kwargs: dict | None = None, ) -> torch.Tensor: - """Extracted image features from a pre-trained frozen encoder. + # obtain the images from the sensor + image_data = image( + env=env, + sensor_cfg=sensor_cfg, + data_type=data_type, + convert_perspective_to_orthogonal=convert_perspective_to_orthogonal, + normalize=False, # we pre-process based on model + ) + # store the device of the image + image_device = image_data.device + # forward the images through the model + features = self._inference_fn(self._model, image_data, **(inference_kwargs or {})) + + # move the features back to the image device + return features.detach().to(image_device) + + """ + Helper functions. + """ + + def _prepare_theia_transformer_model(self, model_name: str, model_device: str) -> dict: + """Prepare the Theia transformer model for inference. Args: - env: The environment. - sensor_cfg: The sensor configuration to poll. Defaults to SceneEntityCfg("tiled_camera"). - data_type: THe sensor configuration datatype. Defaults to "rgb". - convert_perspective_to_orthogonal: Whether to orthogonalize perspective depth images. - This is used only when the data type is "distance_to_camera". Defaults to False. - model_zoo_cfg: Map from model name to model configuration dictionary. Each model - configuration dictionary should include the following entries: - - "model": A callable that returns the model when invoked without arguments. - - "preprocess": A callable that processes the images and returns the preprocessed results. - - "inference": A callable that, when given the model and preprocessed images, - returns the extracted features. - model_name: The name of the model to use for inference. Defaults to "ResNet18". - model_device: The device to store and infer models on. This can be used help offload - computation from the main environment GPU. Defaults to "cuda:0". - reset_model: Initialize the model even if it already exists. Defaults to False. + model_name: The name of the Theia transformer model to prepare. + model_device: The device to store and infer the model on. 
Returns: - torch.Tensor: the image features, on the same device as the image + A dictionary containing the model and inference functions. """ - if model_zoo_cfg is not None: # use other than default - self.model_zoo_cfg.update(model_zoo_cfg) + from transformers import AutoModel - if model_name not in self.model_zoo or reset_model: - # The following allows to only load a desired subset of a model zoo into GPU memory - # as it becomes needed, in a "lazy" evaluation. - print(f"[INFO]: Adding {model_name} to the model zoo") - self.model_zoo[model_name] = self.model_zoo_cfg[model_name]["model"]() + def _load_model() -> torch.nn.Module: + """Load the Theia transformer model.""" + model = AutoModel.from_pretrained(f"theaiinstitute/{model_name}", trust_remote_code=True).eval() + return model.to(model_device) - if model_device is not None and self.model_zoo[model_name].device != model_device: - # want to offload vision model inference to another device - self.model_zoo[model_name] = self.model_zoo[model_name].to(model_device) + def _inference(model, images: torch.Tensor) -> torch.Tensor: + """Inference the Theia transformer model. - images = image( - env=env, - sensor_cfg=sensor_cfg, - data_type=data_type, - convert_perspective_to_orthogonal=convert_perspective_to_orthogonal, - normalize=True, # want this for training stability - ) + Args: + model: The Theia transformer model. + images: The preprocessed image tensor. Shape is (num_envs, height, width, channel). + + Returns: + The extracted features tensor. Shape is (num_envs, feature_dim). + """ + # Move the image to the model device + image_proc = images.to(model_device) + # permute the image to (num_envs, channel, height, width) + image_proc = image_proc.permute(0, 3, 1, 2).float() / 255.0 + # Normalize the image + mean = torch.tensor([0.485, 0.456, 0.406], device=model_device).view(1, 3, 1, 1) + std = torch.tensor([0.229, 0.224, 0.225], device=model_device).view(1, 3, 1, 1) + image_proc = (image_proc - mean) / std + + # Taken from Transformers; inference converted to be GPU only + features = model.backbone.model(pixel_values=image_proc, interpolate_pos_encoding=True) + return features.last_hidden_state[:, 1:] + + # return the model, preprocess and inference functions + return {"model": _load_model, "inference": _inference} - image_device = images.device + def _prepare_resnet_model(self, model_name: str, model_device: str) -> dict: + """Prepare the ResNet model for inference. - if model_device is not None: - images = images.to(model_device) + Args: + model_name: The name of the ResNet model to prepare. + model_device: The device to store and infer the model on. + + Returns: + A dictionary containing the model and inference functions. + """ + from torchvision import models - proc_images = self.model_zoo_cfg[model_name]["preprocess"](images) - features = self.model_zoo_cfg[model_name]["inference"](self.model_zoo[model_name], proc_images) + def _load_model() -> torch.nn.Module: + """Load the ResNet model.""" + # map the model name to the weights + resnet_weights = { + "resnet18": "ResNet18_Weights.IMAGENET1K_V1", + "resnet34": "ResNet34_Weights.IMAGENET1K_V1", + "resnet50": "ResNet50_Weights.IMAGENET1K_V1", + "resnet101": "ResNet101_Weights.IMAGENET1K_V1", + } - return features.to(image_device).clone() + # load the model + model = getattr(models, model_name)(weights=resnet_weights[model_name]).eval() + return model.to(model_device) + + def _inference(model, images: torch.Tensor) -> torch.Tensor: + """Inference the ResNet model. 
+ + Args: + model: The ResNet model. + images: The preprocessed image tensor. Shape is (num_envs, channel, height, width). + + Returns: + The extracted features tensor. Shape is (num_envs, feature_dim). + """ + # move the image to the model device + image_proc = images.to(model_device) + # permute the image to (num_envs, channel, height, width) + image_proc = image_proc.permute(0, 3, 1, 2).float() / 255.0 + # normalize the image + mean = torch.tensor([0.485, 0.456, 0.406], device=model_device).view(1, 3, 1, 1) + std = torch.tensor([0.229, 0.224, 0.225], device=model_device).view(1, 3, 1, 1) + image_proc = (image_proc - mean) / std + + # forward the image through the model + return model(image_proc) + + # return the model, preprocess and inference functions + return {"model": _load_model, "inference": _inference} """ diff --git a/source/extensions/omni.isaac.lab/setup.py b/source/extensions/omni.isaac.lab/setup.py index 3b68d3c84f..01229c8c4f 100644 --- a/source/extensions/omni.isaac.lab/setup.py +++ b/source/extensions/omni.isaac.lab/setup.py @@ -31,6 +31,9 @@ # procedural-generation "trimesh", "pyglet<2", + # image processing + "transformers", + "einops", # needed for transformers, doesn't always auto-install ] PYTORCH_INDEX_URL = ["https://download.pytorch.org/whl/cu118"] diff --git a/source/extensions/omni.isaac.lab_tasks/omni/isaac/lab_tasks/manager_based/classic/cartpole/agents/rl_games_feature_ppo_cfg.yaml b/source/extensions/omni.isaac.lab_tasks/omni/isaac/lab_tasks/manager_based/classic/cartpole/agents/rl_games_feature_ppo_cfg.yaml index 18e0ffd022..41e265e9f2 100644 --- a/source/extensions/omni.isaac.lab_tasks/omni/isaac/lab_tasks/manager_based/classic/cartpole/agents/rl_games_feature_ppo_cfg.yaml +++ b/source/extensions/omni.isaac.lab_tasks/omni/isaac/lab_tasks/manager_based/classic/cartpole/agents/rl_games_feature_ppo_cfg.yaml @@ -63,7 +63,7 @@ params: lr_schedule: adaptive kl_threshold: 0.008 score_to_win: 20000 - max_epochs: 5000 + max_epochs: 200 save_best_after: 50 save_frequency: 25 grad_norm: 1.0 diff --git a/source/extensions/omni.isaac.lab_tasks/omni/isaac/lab_tasks/manager_based/classic/cartpole/cartpole_camera_env_cfg.py b/source/extensions/omni.isaac.lab_tasks/omni/isaac/lab_tasks/manager_based/classic/cartpole/cartpole_camera_env_cfg.py index f767a21962..815178a61a 100644 --- a/source/extensions/omni.isaac.lab_tasks/omni/isaac/lab_tasks/manager_based/classic/cartpole/cartpole_camera_env_cfg.py +++ b/source/extensions/omni.isaac.lab_tasks/omni/isaac/lab_tasks/manager_based/classic/cartpole/cartpole_camera_env_cfg.py @@ -148,16 +148,13 @@ class CartpoleDepthCameraEnvCfg(CartpoleEnvCfg): @configclass class CartpoleResNet18CameraEnvCfg(CartpoleRGBCameraEnvCfg): + """Configuration for the cartpole environment with ResNet18 features as observations.""" + observations: ResNet18ObservationCfg = ResNet18ObservationCfg() @configclass class CartpoleTheiaTinyCameraEnvCfg(CartpoleRGBCameraEnvCfg): - """ - Due to TheiaTiny's size in GPU memory, we reduce the number of environments by default. - This helps reduce the possibility of crashing on more modest hardware. - The following configuration uses ~12gb VRAM at peak. - """ + """Configuration for the cartpole environment with Theia-Tiny features as observations.""" - scene: CartpoleSceneCfg = CartpoleRGBCameraSceneCfg(num_envs=128, env_spacing=20) observations: TheiaTinyObservationCfg = TheiaTinyObservationCfg()
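
A minimal sketch of the user-supplied model-zoo contract that the new image_features term documents: "model" is a zero-argument loader, "inference" maps (model, images, **inference_kwargs) to a feature tensor, and "reset" is an optional hook. Everything below (the tiny CNN, the "tiny_cnn" key, the cuda:0 device) is illustrative and not part of this diff; any frozen torch module with a matching inference function could be slotted in.

import torch
import torch.nn as nn


def _load_tiny_cnn() -> nn.Module:
    """Build a small frozen CNN encoder as a stand-in for a real pre-trained backbone."""
    model = nn.Sequential(
        nn.Conv2d(3, 16, kernel_size=8, stride=4),
        nn.ReLU(),
        nn.Conv2d(16, 32, kernel_size=4, stride=2),
        nn.ReLU(),
        nn.AdaptiveAvgPool2d(1),
        nn.Flatten(),
    )
    return model.eval().to("cuda:0")


@torch.no_grad()
def _tiny_cnn_inference(model: nn.Module, images: torch.Tensor) -> torch.Tensor:
    """Map raw RGB images of shape (num_envs, height, width, channel) to (num_envs, 32) features."""
    # the term passes un-normalized sensor images; move them to the model device,
    # reorder to channels-first and scale to [0, 1] before the forward pass
    image_proc = images.to("cuda:0").permute(0, 3, 1, 2).float() / 255.0
    return model(image_proc)


custom_model_zoo_cfg = {
    "tiny_cnn": {
        "model": _load_tiny_cnn,           # callable invoked without arguments
        "inference": _tiny_cnn_inference,  # callable(model, images, **inference_kwargs)
        # "reset": lambda model, env_ids: None,  # optional hook, e.g. for stateful video encoders
    },
}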
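
To consume the term, an observation group can reference mdp.image_features in the same way the ResNet18/Theia cartpole configurations in this diff do. The sketch below is a hypothetical group, not the exact contents of cartpole_camera_env_cfg.py: the class names are placeholders, and the commented line marks where the custom_model_zoo_cfg from the previous sketch would plug in.

from omni.isaac.lab.envs import mdp
from omni.isaac.lab.managers import ObservationGroupCfg as ObsGroup
from omni.isaac.lab.managers import ObservationTermCfg as ObsTerm
from omni.isaac.lab.managers import SceneEntityCfg
from omni.isaac.lab.utils import configclass


@configclass
class CustomFeaturesObservationCfg:
    """Observation specifications using frozen-encoder image features (illustrative)."""

    @configclass
    class PolicyCfg(ObsGroup):
        features = ObsTerm(
            func=mdp.image_features,
            params={
                "sensor_cfg": SceneEntityCfg("tiled_camera"),
                "data_type": "rgb",
                "model_name": "theia-tiny-patch16-224-cddsv",  # or "resnet18", "resnet34", ...
                "model_device": "cuda:0",
                # "model_zoo_cfg": custom_model_zoo_cfg, "model_name": "tiny_cnn",  # custom entry instead
            },
        )

        def __post_init__(self):
            self.enable_corruption = False
            self.concatenate_terms = True

    policy: PolicyCfg = PolicyCfg()

Because image_features is a ManagerTermBase class, the observation manager instantiates it once when the environment is built and calls it every step, so the encoder is loaded a single time and reused thereafter.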