diff --git a/fiftyone/utils/cvat.py b/fiftyone/utils/cvat.py
index c30b6811dd..aeb1da1e5a 100644
--- a/fiftyone/utils/cvat.py
+++ b/fiftyone/utils/cvat.py
@@ -1587,6 +1587,31 @@ def from_image_dict(cls, d):
         )
 
 
+class HasCVATBinMask:
+    @staticmethod
+    def rle_to_binary_image_mask(rle, mask_width, mask_height) -> np.ndarray:
+        mask = np.zeros(mask_width * mask_height, dtype=np.uint8)
+        counter = 0
+
+        for i, val in enumerate(rle):
+            if i % 2 == 1:
+                mask[counter : counter + val] = 1
+            counter += val
+
+        return mask.reshape(mask_height, mask_width)
+
+    @staticmethod
+    def mask_to_cvat_rle(binary_mask: np.ndarray) -> np.array:
+        counts = []
+        for i, (value, elements) in enumerate(
+            itertools.groupby(binary_mask.ravel(order="C"))
+        ):
+            if i == 0 and value == 1:
+                counts.append(0)
+            counts.append(len(list(elements)))
+        return counts
+
+
 class HasCVATPoints(object):
     """Mixin for CVAT annotations that store a list of ``(x, y)`` pixel
     coordinates.
@@ -5892,6 +5917,9 @@ def _parse_annotation(
             if shape_type == "rectangle":
                 label_type = "detections"
                 label = cvat_shape.to_detection()
+            elif shape_type == "mask":
+                label_type = "detections"
+                label = cvat_shape.to_instance_detection()
             elif shape_type == "polygon":
                 if expected_label_type == "segmentation":
                     # A piece of a segmentation mask
@@ -6402,32 +6430,29 @@ def _create_detection_shapes(
             elif label_type in ("instance", "instances"):
                 if det.mask is None:
                     continue
-
-                polygon = det.to_polyline()
-                for points in polygon.points:
-                    if len(points) < 3:
-                        continue  # CVAT polygons must contain >= 3 points
-
-                    abs_points = HasCVATPoints._to_abs_points(
-                        points, frame_size
-                    )
-                    flattened_points = list(
-                        itertools.chain.from_iterable(abs_points)
-                    )
-
-                    curr_shapes.append(
-                        {
-                            "type": "polygon",
-                            "occluded": is_occluded,
-                            "z_order": 0,
-                            "points": flattened_points,
-                            "label_id": class_name,
-                            "group": group_id,
-                            "frame": frame_id,
-                            "source": "manual",
-                            "attributes": deepcopy(attributes),
-                        }
-                    )
+                x, y, _, _ = det.bounding_box
+                frame_width, frame_height = frame_size
+                mask_height, mask_width = det.mask.shape
+                xtl, ytl = round(x * frame_width), round(y * frame_height)
+                xbr, ybr = xtl + mask_width, ytl + mask_height
+
+                rle = HasCVATBinMask.mask_to_cvat_rle(det.mask)
+                rle.extend(  # Necessary as per CVAT API
+                    [xtl, ytl, xbr - 1, ybr - 1]
+                )
+                curr_shapes.append(
+                    {
+                        "type": "mask",
+                        "occluded": is_occluded,
+                        "z_order": 0,
+                        "points": rle,
+                        "label_id": class_name,
+                        "group": group_id,
+                        "frame": frame_id,
+                        "source": "manual",
+                        "attributes": deepcopy(attributes),
+                    }
+                )
 
             if not curr_shapes:
                 continue
@@ -7076,6 +7101,38 @@ def to_detection(self):
         self._set_attributes(label)
         return label
 
+    def to_instance_detection(self):
+        """Converts this shape to a :class:`fiftyone.core.labels.Detection`.
+        Special case where we also have a mask
+
+        Returns:
+            a :class:`fiftyone.core.labels.Detection`
+        """
+        xtl, ytl, xbr, ybr = self.points[-4:]
+        rel = np.array(self.points[:-4], dtype=int)
+        frame_width, frame_height = self.frame_size
+        mask_w, mask_h = (
+            round(xbr - xtl) + 1,
+            round(ybr - ytl) + 1,
+        )  # We need to add 1 because cvat uses - 1
+        mask = HasCVATBinMask.rle_to_binary_image_mask(
+            rel, mask_height=mask_h, mask_width=mask_w
+        )
+        bbox = [
+            xtl / frame_width,
+            ytl / frame_height,
+            (xbr - xtl) / frame_width,
+            (ybr - ytl) / frame_height,
+        ]
+        label = fol.Detection(
+            label=self.label,
+            bounding_box=bbox,
+            index=self.index,
+            mask=mask,
+        )
+        self._set_attributes(label)
+        return label
+
     def to_polyline(self, closed=False, filled=False):
         """Converts this shape to a :class:`fiftyone.core.labels.Polyline`.