From b90ca31f8e6df30cc8629f806e5fd46e7ff6f47c Mon Sep 17 00:00:00 2001
From: NicDionne <nicolas.dionne@hotmail.com>
Date: Wed, 16 Oct 2024 20:26:21 -0400
Subject: [PATCH 1/4] First iteration [CVAT integration] Use pixelwise masks,
 not polygons, for instance segmentation #4483

- Masks can now be uploaded to CVAT
[] Missing tests
[] Missing mask download
---
 fiftyone/utils/cvat.py | 114 +++++++++++++++++++++++++++++++----------
 1 file changed, 87 insertions(+), 27 deletions(-)

diff --git a/fiftyone/utils/cvat.py b/fiftyone/utils/cvat.py
index c30b6811dd..6c33d69ca8 100644
--- a/fiftyone/utils/cvat.py
+++ b/fiftyone/utils/cvat.py
@@ -1587,6 +1587,39 @@ def from_image_dict(cls, d):
         )
 
 
+class HasCVATBinMask:
+    @staticmethod
+    def cvat_rle_to_binary_image_mask(
+        cvat_rle, left, top, width, img_h: int, img_w: int
+    ) -> np.ndarray:
+        # Source https://github.com/cvat-ai/cvat/issues/6487#issuecomment-1640097518
+        # convert CVAT tight object RLE to COCO-style whole image mask
+        rle = cvat_rle
+        mask = np.zeros((img_h, img_w), dtype=np.uint8)
+        value = 0
+        offset = 0
+        for rle_count in rle:
+            while rle_count > 0:
+                y, x = divmod(offset, width)
+                mask[y + top][x + left] = value
+                rle_count -= 1
+                offset += 1
+            value = 1 - value
+
+        return mask
+
+    @staticmethod
+    def mask_to_cvat_rle(binary_mask: np.ndarray) -> np.array:
+        counts = []
+        for i, (value, elements) in enumerate(
+            itertools.groupby(binary_mask.ravel(order="C"))
+        ):
+            if i == 0 and value == 1:
+                counts.append(0)
+            counts.append(len(list(elements)))
+        return counts
+
+
 class HasCVATPoints(object):
     """Mixin for CVAT annotations that store a list of ``(x, y)`` pixel
     coordinates.
@@ -5905,7 +5938,7 @@ def _parse_annotation(
                 ):
                     # A piece of an instance mask
                     label_type = "detections"
-                    label = cvat_shape.to_polyline(closed=True, filled=True)
+                    label = cvat_shape.to_instance_detection()
                 else:
                     # A regular polyline or polygon
                     if expected_label_type in ("polyline", "polylines"):
@@ -6402,32 +6435,29 @@ def _create_detection_shapes(
             elif label_type in ("instance", "instances"):
                 if det.mask is None:
                     continue
-
-                polygon = det.to_polyline()
-                for points in polygon.points:
-                    if len(points) < 3:
-                        continue  # CVAT polygons must contain >= 3 points
-
-                    abs_points = HasCVATPoints._to_abs_points(
-                        points, frame_size
-                    )
-                    flattened_points = list(
-                        itertools.chain.from_iterable(abs_points)
-                    )
-
-                    curr_shapes.append(
-                        {
-                            "type": "polygon",
-                            "occluded": is_occluded,
-                            "z_order": 0,
-                            "points": flattened_points,
-                            "label_id": class_name,
-                            "group": group_id,
-                            "frame": frame_id,
-                            "source": "manual",
-                            "attributes": deepcopy(attributes),
-                        }
-                    )
+                x, y, _, _ = det.bounding_box
+                frame_width, frame_height = frame_size
+                mask_height, mask_width = det.mask.shape
+                xtl, ytl = round(x * frame_width), round(y * frame_height)
+                xbr, ybr = xtl + mask_width, ytl + mask_height
+
+                rle = HasCVATBinMask.mask_to_cvat_rle(det.mask)
+                rle.extend(  # Necessary as per CVAT API
+                    [xtl, ytl, xbr - 1, ybr - 1]
+                )
+                curr_shapes.append(
+                    {
+                        "type": "mask",
+                        "occluded": is_occluded,
+                        "z_order": 0,
+                        "points": rle,
+                        "label_id": class_name,
+                        "group": group_id,
+                        "frame": frame_id,
+                        "source": "manual",
+                        "attributes": deepcopy(attributes),
+                    }
+                )
 
             if not curr_shapes:
                 continue
@@ -7076,6 +7106,36 @@ def to_detection(self):
         self._set_attributes(label)
         return label
 
+    def to_instance_detection(self):
+        """Converts this shape to a :class:`fiftyone.core.labels.Detection`.
+        Special case where we also have a mask
+
+        Returns:
+            a :class:`fiftyone.core.labels.Detection`
+        """
+
+        xtl, ytl, xbr, ybr = self.points[-4:]
+        rel = self.points[:-4]
+        width, height = self.frame_size
+        mask = HasCVATBinMask.cvat_rle_to_binary_image_mask(
+            rel, top=ytl, left=xtl, width=xbr - xtl, img_h=height, img_w=width
+        )
+        cropped_mask = mask[ytl:ybr, xtl:xbr]
+        bbox = [
+            xtl / width,
+            ytl / height,
+            (xbr - xtl) / width,
+            (ybr - ytl) / height,
+        ]
+        label = fol.Detection(
+            label=self.label,
+            bounding_box=bbox,
+            index=self.index,
+            mask=cropped_mask,
+        )
+        self._set_attributes(label)
+        return label
+
     def to_polyline(self, closed=False, filled=False):
         """Converts this shape to a :class:`fiftyone.core.labels.Polyline`.
 

From 97b900057fcf0e9bbae7325a67e522f8edf49c31 Mon Sep 17 00:00:00 2001
From: NicDionne <nicolas.dionne@hotmail.com>
Date: Thu, 17 Oct 2024 20:20:18 -0400
Subject: [PATCH 2/4] Can now download mask annotations

---
 fiftyone/utils/cvat.py | 71 ++++++++++++++++++++++++------------------
 1 file changed, 41 insertions(+), 30 deletions(-)

diff --git a/fiftyone/utils/cvat.py b/fiftyone/utils/cvat.py
index 6c33d69ca8..f918bc2023 100644
--- a/fiftyone/utils/cvat.py
+++ b/fiftyone/utils/cvat.py
@@ -1589,24 +1589,14 @@ def from_image_dict(cls, d):
 
 class HasCVATBinMask:
     @staticmethod
-    def cvat_rle_to_binary_image_mask(
-        cvat_rle, left, top, width, img_h: int, img_w: int
-    ) -> np.ndarray:
-        # Source https://github.com/cvat-ai/cvat/issues/6487#issuecomment-1640097518
-        # convert CVAT tight object RLE to COCO-style whole image mask
-        rle = cvat_rle
-        mask = np.zeros((img_h, img_w), dtype=np.uint8)
-        value = 0
-        offset = 0
-        for rle_count in rle:
-            while rle_count > 0:
-                y, x = divmod(offset, width)
-                mask[y + top][x + left] = value
-                rle_count -= 1
-                offset += 1
-            value = 1 - value
-
-        return mask
+    def rle_to_binary_image_mask(rle, mask_width, mask_height) -> np.ndarray:
+        mask = np.zeros(mask_width * mask_height, dtype=np.uint8)
+        counter = 0
+        for i, val in enumerate(rle):
+            if i % 2 == 1:
+                mask[counter : counter + val] = 1
+            counter += val
+        return mask.reshape(mask_width, mask_height)
 
     @staticmethod
     def mask_to_cvat_rle(binary_mask: np.ndarray) -> np.array:
@@ -5925,6 +5915,9 @@ def _parse_annotation(
             if shape_type == "rectangle":
                 label_type = "detections"
                 label = cvat_shape.to_detection()
+            elif shape_type == "mask":
+                label_type = "detections"
+                label = cvat_shape.to_instance_detection()
             elif shape_type == "polygon":
                 if expected_label_type == "segmentation":
                     # A piece of a segmentation mask
@@ -5938,7 +5931,7 @@ def _parse_annotation(
                 ):
                     # A piece of an instance mask
                     label_type = "detections"
-                    label = cvat_shape.to_instance_detection()
+                    label = cvat_shape.to_polyline(closed=True, filled=True)
                 else:
                     # A regular polyline or polygon
                     if expected_label_type in ("polyline", "polylines"):
@@ -6445,6 +6438,23 @@ def _create_detection_shapes(
                 rle.extend(  # Necessary as per CVAT API
                     [xtl, ytl, xbr - 1, ybr - 1]
                 )
+                print(
+                    xbr,
+                    frame_width,
+                    xbr / frame_width,
+                    type(xbr),
+                    type(frame_width),
+                )
+                print(
+                    " Beginning box : ",
+                    det.bounding_box,
+                    "mask_W : ",
+                    mask_width,
+                    "frame_size ",
+                    frame_size,
+                    "bbox : ",
+                    [xtl, ytl, xbr - 1, ybr - 1],
+                )
                 curr_shapes.append(
                     {
                         "type": "mask",
@@ -7113,25 +7123,26 @@ def to_instance_detection(self):
         Returns:
             a :class:`fiftyone.core.labels.Detection`
         """
-
         xtl, ytl, xbr, ybr = self.points[-4:]
-        rel = self.points[:-4]
-        width, height = self.frame_size
-        mask = HasCVATBinMask.cvat_rle_to_binary_image_mask(
-            rel, top=ytl, left=xtl, width=xbr - xtl, img_h=height, img_w=width
+        rel = np.array(self.points[:-4], dtype=int)
+        frame_width, frame_height = self.frame_size
+        mask = HasCVATBinMask.rle_to_binary_image_mask(
+            rel,
+            mask_width=round(xbr - xtl) + 1,
+            mask_height=round(ybr - ytl)
+            + 1,  # We need to add 1 because cvat uses - 1
         )
-        cropped_mask = mask[ytl:ybr, xtl:xbr]
         bbox = [
-            xtl / width,
-            ytl / height,
-            (xbr - xtl) / width,
-            (ybr - ytl) / height,
+            xtl / frame_width,
+            ytl / frame_height,
+            (xbr - xtl) / frame_width,
+            (ybr - ytl) / frame_height,
         ]
         label = fol.Detection(
             label=self.label,
             bounding_box=bbox,
             index=self.index,
-            mask=cropped_mask,
+            mask=mask,
         )
         self._set_attributes(label)
         return label

From 1df8997a577e57ea6cc487cff91b196979665986 Mon Sep 17 00:00:00 2001
From: NicDionne <nicolas.dionne@hotmail.com>
Date: Thu, 17 Oct 2024 20:55:28 -0400
Subject: [PATCH 3/4] Fix mask width/height order and remove debug prints

---
 fiftyone/utils/cvat.py | 31 ++++++++++---------------------
 1 file changed, 10 insertions(+), 21 deletions(-)

diff --git a/fiftyone/utils/cvat.py b/fiftyone/utils/cvat.py
index f918bc2023..8e8d72e687 100644
--- a/fiftyone/utils/cvat.py
+++ b/fiftyone/utils/cvat.py
@@ -1592,11 +1592,16 @@ class HasCVATBinMask:
     def rle_to_binary_image_mask(rle, mask_width, mask_height) -> np.ndarray:
         mask = np.zeros(mask_width * mask_height, dtype=np.uint8)
         counter = 0
+
         for i, val in enumerate(rle):
             if i % 2 == 1:
                 mask[counter : counter + val] = 1
             counter += val
+
         return mask.reshape(mask_width, mask_height)
+        # mask = np.zeros(mask_width * mask_height, dtype=np.uint8)
+        # mask[np.add.accumulate(rle)[::2]] = 1
+        # return mask.reshape(mask_width, mask_height)
 
     @staticmethod
     def mask_to_cvat_rle(binary_mask: np.ndarray) -> np.array:
@@ -6438,23 +6443,6 @@ def _create_detection_shapes(
                 rle.extend(  # Necessary as per CVAT API
                     [xtl, ytl, xbr - 1, ybr - 1]
                 )
-                print(
-                    xbr,
-                    frame_width,
-                    xbr / frame_width,
-                    type(xbr),
-                    type(frame_width),
-                )
-                print(
-                    " Beginning box : ",
-                    det.bounding_box,
-                    "mask_W : ",
-                    mask_width,
-                    "frame_size ",
-                    frame_size,
-                    "bbox : ",
-                    [xtl, ytl, xbr - 1, ybr - 1],
-                )
                 curr_shapes.append(
                     {
                         "type": "mask",
@@ -7126,11 +7114,12 @@ def to_instance_detection(self):
         xtl, ytl, xbr, ybr = self.points[-4:]
         rel = np.array(self.points[:-4], dtype=int)
         frame_width, frame_height = self.frame_size
+        mask_w, mask_h = (
+            round(xbr - xtl) + 1,
+            round(ybr - ytl) + 1,
+        )  # We need to add 1 because cvat uses - 1
         mask = HasCVATBinMask.rle_to_binary_image_mask(
-            rel,
-            mask_width=round(xbr - xtl) + 1,
-            mask_height=round(ybr - ytl)
-            + 1,  # We need to add 1 because cvat uses - 1
+            rel, mask_width=mask_h, mask_height=mask_w
         )
         bbox = [
             xtl / frame_width,

From 513b6338101e34b6cdceb3773be8fb4fda9e3aa6 Mon Sep 17 00:00:00 2001
From: NicDionne <nicolas.dionne@hotmail.com>
Date: Thu, 17 Oct 2024 22:20:34 -0400
Subject: [PATCH 4/4] Fix confusing variable names flagged by CodeRabbit

---
 fiftyone/utils/cvat.py | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/fiftyone/utils/cvat.py b/fiftyone/utils/cvat.py
index 8e8d72e687..aeb1da1e5a 100644
--- a/fiftyone/utils/cvat.py
+++ b/fiftyone/utils/cvat.py
@@ -1598,10 +1598,7 @@ def rle_to_binary_image_mask(rle, mask_width, mask_height) -> np.ndarray:
                 mask[counter : counter + val] = 1
             counter += val
 
-        return mask.reshape(mask_width, mask_height)
-        # mask = np.zeros(mask_width * mask_height, dtype=np.uint8)
-        # mask[np.add.accumulate(rle)[::2]] = 1
-        # return mask.reshape(mask_width, mask_height)
+        return mask.reshape(mask_height, mask_width)
 
     @staticmethod
     def mask_to_cvat_rle(binary_mask: np.ndarray) -> np.array:
@@ -7119,7 +7116,7 @@ def to_instance_detection(self):
             round(ybr - ytl) + 1,
         )  # We need to add 1 because cvat uses - 1
         mask = HasCVATBinMask.rle_to_binary_image_mask(
-            rel, mask_width=mask_h, mask_height=mask_w
+            rel, mask_height=mask_h, mask_width=mask_w
         )
         bbox = [
             xtl / frame_width,