generated from yuecideng/PrimitivesFittingLib
Merge pull request #20 from yuecideng/yuecideng/LabelMaker6D
Label Maker for Instance-level 6D Pose Estimation
Showing 11 changed files with 1,061 additions and 0 deletions.
@@ -41,5 +41,6 @@ __pycache__/
*.pyc

# App/Reconstruction folder
dataset/
fragments/
scene/
@@ -0,0 +1,220 @@
#!/usr/bin/python3
# -*- coding: utf-8 -*-

import time
import numpy as np
import pyrealsense2 as rs
from enum import IntEnum


class RealSenseResolution(IntEnum):
    Low = 0  # 480X270
    Medium = 1  # 640X480
    High = 2  # 848X480
    Max = 3  # 1280X720


class RealSenseFPS(IntEnum):
    Low = 0  # 6
    Medium = 1  # 15
    High = 2  # 30
    Max = 3  # 60


class RealSenseManager:

    def __init__(self,
                 resolution=RealSenseResolution.High,
                 fps=RealSenseFPS.High):
        self.align = rs.align(rs.stream.color)
        self.config = rs.config()

        if resolution == RealSenseResolution.Low:
            self.resolution = (480, 270)
        elif resolution == RealSenseResolution.Medium:
            self.resolution = (640, 480)
        elif resolution == RealSenseResolution.High:
            self.resolution = (848, 480)
        else:
            self.resolution = (1280, 720)

        if fps == RealSenseFPS.Low:
            fps_ = 6
        elif fps == RealSenseFPS.Medium:
            fps_ = 15
        elif fps == RealSenseFPS.High:
            fps_ = 30
        else:
            fps_ = 60

        self.config.enable_stream(rs.stream.depth, self.resolution[0],
                                  self.resolution[1], rs.format.z16, fps_)
        # the color stream tops out at 30 FPS
        fps_color = 30 if fps_ > 30 else fps_
        self.config.enable_stream(rs.stream.color, self.resolution[0],
                                  self.resolution[1], rs.format.rgb8,
                                  fps_color)
        self.pipeline = rs.pipeline()

        self.profile = None
        self.depth_profile = None
        self.color_profile = None
        self.sensor = None

    def get_intrinsics(self, type='color'):
        """ Get intrinsics of the RGB camera or IR camera, which vary with resolution
        Args:
            type ([string]): 'color' or 'depth'
        Returns:
            [tuple[List, List]]: K ([fx, fy, ppx, ppy]) and distortion coefficients
        """
        if self.sensor is None:
            print('Sensor not opened!')
            return None
        if type == 'color':
            intrin = self.profile.get_stream(
                rs.stream.color).as_video_stream_profile().get_intrinsics()
        else:
            intrin = self.profile.get_stream(
                rs.stream.depth).as_video_stream_profile().get_intrinsics()
        K = [intrin.fx, intrin.fy, intrin.ppx, intrin.ppy]
        dist = [
            intrin.coeffs[0], intrin.coeffs[1], intrin.coeffs[2],
            intrin.coeffs[3], intrin.coeffs[4]
        ]
        return (K, dist)

    def get_extrinsics(self):
        """ Get extrinsics from the IR camera to the RGB camera
        Returns:
            [np.ndarray(4 X 4)]: homogeneous transformation matrix
        """
        if self.sensor is None:
            print('Sensor not opened!')
            return None

        res = self.depth_profile.get_extrinsics_to(self.color_profile)
        rotation = np.array(res.rotation).reshape((3, 3))
        translation = np.array(res.translation)
        T = np.identity(4)
        T[:3, :3] = rotation
        T[:3, 3] = translation

        return T

    def get_resolution(self):
        """ Get resolution of the camera
        Returns:
            tuple: (width, height)
        """
        return self.resolution

    def set_laser_power(self, power):
        """ Set laser power, clamped to the range [10, 360]
        Args:
            power ([int]): laser power
        """
        power = max(10, min(360, power))
        if self.sensor is None:
            print('Sensor not opened!')
            return
        self.sensor.set_option(rs.option.laser_power, power)

    def get_data(self, hole_fill=False):
        """ Get data from the camera
        Args:
            hole_fill ([bool]): whether to fill holes in the depth image
        Returns:
            [tuple[np.ndarray, np.ndarray]]: color and depth images
        """
        if self.sensor is None:
            print('Sensor not opened!')
            return None

        frames = self.pipeline.wait_for_frames()
        aligned_frames = self.align.process(frames)
        color_frame = aligned_frames.get_color_frame()
        depth_frame = aligned_frames.get_depth_frame()
        if not color_frame or not depth_frame:
            print('Can not get data from realsense!')
            return None
        # fill = 2 uses the 4 neighboring pixels for hole filling
        fill = 2 if hole_fill else 0
        depth_frame = rs.spatial_filter(0.5, 20, 2, fill).process(depth_frame)

        # convert to numpy arrays
        color_image = np.asanyarray(color_frame.get_data())
        depth_image = np.asanyarray(depth_frame.get_data())

        return (color_image, depth_image)

    def open(self):
        """ Open the camera
        Returns:
            [bool]: True on success
        """
        self.profile = self.pipeline.start(self.config)
        self.depth_profile = rs.video_stream_profile(
            self.profile.get_stream(rs.stream.depth))
        self.color_profile = rs.video_stream_profile(
            self.profile.get_stream(rs.stream.color))
        device = self.profile.get_device()
        if len(device.query_sensors()) == 0:
            print('Can not find realsense sensor!')
            return False
        else:
            self.sensor = device.query_sensors()[0]
            # set default laser power
            self.sensor.set_option(rs.option.laser_power, 200)
            # set to high density mode
            self.sensor.set_option(rs.option.visual_preset, 4)
            return True

    def close(self):
        self.pipeline.stop()
        self.sensor = None
        self.profile = None
        self.depth_profile = None
        self.color_profile = None


if __name__ == '__main__':
    import cv2

    camera = RealSenseManager(resolution=RealSenseResolution.High,
                              fps=RealSenseFPS.High)
    camera.open()
    try:
        while True:
            t0 = time.time()
            data = camera.get_data(hole_fill=False)
            if data is None:
                continue
            # get_data already returns numpy arrays
            color, depth = data
            # rs.colorizer() operates on frames, not arrays, so colorize the
            # 16 bit depth image with an OpenCV colormap instead
            depth_color_image = cv2.applyColorMap(
                cv2.convertScaleAbs(depth, alpha=0.03), cv2.COLORMAP_JET)
            color_image_bgr = cv2.cvtColor(color, cv2.COLOR_RGB2BGR)
            t1 = time.time()
            fps = 'FPS: ' + str(int(1 / (t1 - t0)))
            cv2.putText(color_image_bgr, fps, (50, 50),
                        cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2,
                        cv2.LINE_AA)
            cv2.namedWindow('color image', cv2.WINDOW_AUTOSIZE)
            cv2.imshow('color image', color_image_bgr)
            cv2.namedWindow('depth image', cv2.WINDOW_AUTOSIZE)
            cv2.imshow('depth image', depth_color_image)
            cv2.waitKey(1)
    except KeyboardInterrupt:
        pass
    finally:
        camera.close()
        cv2.destroyAllWindows()
        print('Exit')
@@ -0,0 +1,138 @@
# Label Maker for 6D Pose Estimation
This is a simple offline label maker for **instance-level** 6D pose estimation ground truth generation, composed of several Python scripts. The whole pipeline is as follows:
- Collect RGBD data from the sensor.
- Run RGBD dense reconstruction.
- Initialize object poses.
- Render masks and generate labels.

## Requirements
- `opencv-python`
- `open3d`
- `misc3d` (with the `reconstruction` module enabled)

## Pipeline
### Step 1: Collect RGBD data
We provide a simple script (`label_maker/record_data.py`) to collect RGBD data from a RealSense RGBD camera. The script exposes a few arguments that control the record mode and frame rate.

You can also collect the data yourself, but it is recommended to use the following structure (a minimal recording sketch is shown after the layout):
```
dataset/
- color/
  - 000000.png
  - 000001.png
  - ...
- depth/
  - 000000.png
  - 000001.png
  - ...
```
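
As a reference, the sketch below writes pairs into that layout. It assumes the `RealSenseManager` class from the script above is importable as `record_data`; adjust the import to the actual module name in your checkout.
```python
# Minimal recording sketch (assumes RealSenseManager is importable).
import os
import cv2
from record_data import RealSenseManager, RealSenseResolution, RealSenseFPS

os.makedirs('dataset/color', exist_ok=True)
os.makedirs('dataset/depth', exist_ok=True)

camera = RealSenseManager(resolution=RealSenseResolution.High,
                          fps=RealSenseFPS.High)
camera.open()
try:
    for i in range(300):  # keep datasets under ~300 pairs (see note below)
        color, depth = camera.get_data(hole_fill=False)
        name = '%06d.png' % i
        # color is RGB; OpenCV writes BGR, so convert first
        cv2.imwrite(os.path.join('dataset/color', name),
                    cv2.cvtColor(color, cv2.COLOR_RGB2BGR))
        # depth is a 16 bit image and is saved as-is
        cv2.imwrite(os.path.join('dataset/depth', name), depth)
finally:
    camera.close()
```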

**Note:**
You should collect no more than 300 pairs of RGBD data for each dataset, otherwise the scene reconstruction in the next step will be very slow. You can record data multiple times with different scenes (different models, backgrounds, etc.).
### Step 2: Run RGBD dense reconstruction
Run `python3 reconstruction.py config.json` to get the reconstructed scene. You must change the `camera` parameters in the `config.json` file to match your camera specification.

You can use `draw_geometry.py` to visualize the reconstructed scene as a triangle mesh or point cloud.
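
If you prefer to skip the helper script, the same check can be done directly with Open3D. The mesh file name below is an assumption (the reconstruction writes into the `scene/` folder ignored by `.gitignore` above); point it at whatever your `config.json` actually produces.
```python
import open3d as o3d

# 'scene/integrated.ply' is an assumed output path; adjust to your config.
mesh = o3d.io.read_triangle_mesh('scene/integrated.ply')
o3d.visualization.draw_geometries([mesh])
```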
### Step 3: Pose initialization
1. Prepare your model files with the following structure:
```
model/
- 0.ply
- 1.ply
- 2.ply
- ...
```
**You must use a number as the model name: it becomes the object id encoded in the mask of each frame.**
2. Run `init_obj_pose.py --model_path <your/model/path> --data_path <your/data/path>` and follow the instructions printed in the terminal (a naming sanity check is sketched after this list).
After the process finishes, you will find `init_poses.json` in your data path.
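
Because the numeric file stem ends up as the object id in the masks, a quick hypothetical check can catch misnamed models before labeling:
```python
import os
import re

# Hypothetical check: every model must be named '<number>.ply', since the
# number becomes the obj_id encoded in the generated masks and labels.
for name in sorted(os.listdir('model')):
    if not re.fullmatch(r'\d+\.ply', name):
        raise ValueError('model file is not numerically named: ' + name)
    print('obj_id', int(name[:-4]), '->', name)
```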
### Step 4: Render masks and generate labels
Run `python3 generate_labels.py --model_path <your/model/path> --data_path <your/data/path>` (add `--vis` to visualize the rendered instance masks).
The results are 16 bit masks stored in `dataset/mask`, plus a `json` file containing the 6D pose, bounding box, object id and instance id of every instance. A minimal example with only one frame is shown below:
```json
{
    "000000": [
        {
            "obj_id": 0,
            "instance_id": 0,
            "cam_R_m2c": [
                [0.9014091842603533, 0.43197344750891603, -0.029332970840869017],
                [0.22044418653117434, -0.5162045410566827, -0.8276093477100617],
                [-0.3726470758716603, 0.7395483841100113, -0.560537549504563]
            ],
            "cam_t_m2c": [0.29020161109027537, 0.2501192190463131, 0.6792205163170392],
            "bbox": [498, 398, 142, 82]
        },
        {
            "obj_id": 0,
            "instance_id": 1,
            "cam_R_m2c": [
                [0.816165164729526, 0.5773665735205735, -0.022853088700338718],
                [0.30813881463069986, -0.4683609651098311, -0.828063087741133],
                [-0.4887994423073907, 0.6687943227499061, -0.5601689558137602]
            ],
            "cam_t_m2c": [0.12174972304257478, 0.18991541206314635, 0.7773315438193125],
            "bbox": [350, 337, 144, 133]
        }
    ]
}
```

The value of each instance in the mask equals `v = obj_id * 1000 + instance_id`. You can decode each instance into an 8 bit mask image containing a single object with the following code:
```python
import cv2
import numpy as np

# you already have obj_id and instance_id; load the 16 bit mask of the frame
mask = cv2.imread('dataset/mask/000000.png', cv2.IMREAD_UNCHANGED)
v = obj_id * 1000 + instance_id
mask_new = np.zeros((mask.shape[0], mask.shape[1]), dtype=np.uint8)
mask_new[mask == v] = 255
```
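
For completeness, here is a hedged sketch of consuming the generated labels end to end; the label file name `labels.json` and the `[x, y, width, height]` bbox layout are assumptions, so substitute whatever `generate_labels.py` actually writes:
```python
import json
import cv2
import numpy as np

# 'dataset/labels.json' is an assumed file name; the keys below follow the
# example above (obj_id, instance_id, cam_R_m2c, cam_t_m2c, bbox).
with open('dataset/labels.json') as f:
    labels = json.load(f)

mask = cv2.imread('dataset/mask/000000.png', cv2.IMREAD_UNCHANGED)
for inst in labels['000000']:
    # 4x4 model-to-camera transform assembled from cam_R_m2c / cam_t_m2c
    T = np.identity(4)
    T[:3, :3] = np.array(inst['cam_R_m2c'])
    T[:3, 3] = np.array(inst['cam_t_m2c'])
    # bbox layout assumed to be [x, y, width, height]
    x, y, w, h = inst['bbox']
    # binary mask of this instance, using v = obj_id * 1000 + instance_id
    v = inst['obj_id'] * 1000 + inst['instance_id']
    binary = (mask == v).astype(np.uint8) * 255
    print(inst['obj_id'], inst['instance_id'], (x, y, w, h), T[:3, 3])
```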

## Reference
LabelFusion
- [paper](https://ieeexplore.ieee.org/abstract/document/8460950)
- [code](https://github.com/RobotLocomotion/LabelFusion)