class HasCVATBinMask(object):
    """Mixin for CVAT annotations that store a binary instance mask as a
    run-length encoding (RLE).

    The RLE is a flat list of run lengths over the mask flattened in row-major
    (C) order. Runs alternate between background and foreground, and the
    first run always counts background pixels, so it is ``0`` when the mask
    starts with a foreground pixel.
    """

    @staticmethod
    def rle_to_binary_image_mask(rle, mask_width, mask_height) -> np.ndarray:
        """Decodes a CVAT RLE into a 2D binary mask.

        Args:
            rle: an iterable of run lengths, alternating background and
                foreground runs, starting with a background run
            mask_width: the width of the mask, in pixels
            mask_height: the height of the mask, in pixels

        Returns:
            a ``(mask_height, mask_width)`` ``uint8`` array of 0s and 1s
        """
        mask = np.zeros(mask_width * mask_height, dtype=np.uint8)
        offset = 0

        for i, run in enumerate(rle):
            run = int(run)

            # Odd positions are foreground runs; even positions only advance
            # the write offset over background pixels
            if i % 2 == 1:
                mask[offset : offset + run] = 1

            offset += run

        return mask.reshape(mask_height, mask_width)

    @staticmethod
    def mask_to_cvat_rle(binary_mask: np.ndarray) -> list:
        """Encodes a binary mask as a CVAT RLE.

        Any nonzero pixel is treated as foreground, so boolean masks and
        masks that use values such as ``255`` are encoded the same way as
        0/1 masks.

        Args:
            binary_mask: a 2D array whose nonzero entries are foreground

        Returns:
            a list of ints with the alternating background/foreground run
            lengths of the mask in row-major order, starting with a background
            run (which is ``0`` if the first pixel is foreground). An empty
            mask yields an empty list
        """
        flat = np.asarray(binary_mask).astype(bool).ravel(order="C")
        if flat.size == 0:
            return []

        # Indices at which the pixel value differs from its predecessor, i.e.
        # the start of every run except the first
        run_starts = np.flatnonzero(flat[1:] != flat[:-1]) + 1
        boundaries = np.concatenate(([0], run_starts, [flat.size]))

        # tolist() yields plain Python ints, which callers can extend and
        # serialize directly
        counts = np.diff(boundaries).tolist()

        if flat[0]:
            # The encoding must begin with a (possibly empty) background run
            counts.insert(0, 0)

        return counts
def to_instance_detection(self):
    """Converts this shape to a :class:`fiftyone.core.labels.Detection`
    with an instance mask.

    ``self.points`` is read as a run-length encoding of the mask followed by
    the four values ``[xtl, ytl, xbr, ybr]`` giving the mask's pixel bounds
    within the frame.

    Returns:
        a :class:`fiftyone.core.labels.Detection`
    """
    xtl, ytl, xbr, ybr = self.points[-4:]
    rle = np.array(self.points[:-4], dtype=int)
    frame_width, frame_height = self.frame_size

    # The bottom-right corner is inclusive (the exporter writes
    # ``xbr - 1, ybr - 1``), so the mask spans one more pixel than the
    # coordinate difference along each axis
    mask_w = round(xbr - xtl) + 1
    mask_h = round(ybr - ytl) + 1

    mask = HasCVATBinMask.rle_to_binary_image_mask(
        rle, mask_height=mask_h, mask_width=mask_w
    )

    # Derive the box extent from the mask size so that the bounding box and
    # the mask cover exactly the same pixels
    bbox = [
        xtl / frame_width,
        ytl / frame_height,
        mask_w / frame_width,
        mask_h / frame_height,
    ]

    label = fol.Detection(
        label=self.label,
        bounding_box=bbox,
        index=self.index,
        mask=mask,
    )
    self._set_attributes(label)
    return label